1 | //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the SampleProfileLoader transformation. This pass |
10 | // reads a profile file generated by a sampling profiler (e.g. Linux Perf - |
11 | // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the |
12 | // profile information in the given profile. |
13 | // |
14 | // This pass generates branch weight annotations on the IR: |
15 | // |
16 | // - prof: Represents branch weights. This annotation is added to branches |
17 | // to indicate the weights of each edge coming out of the branch. |
18 | // The weight of each edge is the weight of the target block for |
19 | // that edge. The weight of a block B is computed as the maximum |
20 | // number of samples found in B. |
21 | // |
22 | //===----------------------------------------------------------------------===// |
23 | |
24 | #include "llvm/Transforms/IPO/SampleProfile.h" |
25 | #include "llvm/ADT/ArrayRef.h" |
26 | #include "llvm/ADT/DenseMap.h" |
27 | #include "llvm/ADT/DenseSet.h" |
28 | #include "llvm/ADT/MapVector.h" |
29 | #include "llvm/ADT/PriorityQueue.h" |
30 | #include "llvm/ADT/SCCIterator.h" |
31 | #include "llvm/ADT/SmallVector.h" |
32 | #include "llvm/ADT/Statistic.h" |
33 | #include "llvm/ADT/StringMap.h" |
34 | #include "llvm/ADT/StringRef.h" |
35 | #include "llvm/ADT/Twine.h" |
36 | #include "llvm/Analysis/AssumptionCache.h" |
37 | #include "llvm/Analysis/BlockFrequencyInfoImpl.h" |
38 | #include "llvm/Analysis/InlineAdvisor.h" |
39 | #include "llvm/Analysis/InlineCost.h" |
40 | #include "llvm/Analysis/LazyCallGraph.h" |
41 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
42 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
43 | #include "llvm/Analysis/ReplayInlineAdvisor.h" |
44 | #include "llvm/Analysis/TargetLibraryInfo.h" |
45 | #include "llvm/Analysis/TargetTransformInfo.h" |
46 | #include "llvm/IR/BasicBlock.h" |
47 | #include "llvm/IR/DebugLoc.h" |
48 | #include "llvm/IR/DiagnosticInfo.h" |
49 | #include "llvm/IR/Function.h" |
50 | #include "llvm/IR/GlobalValue.h" |
51 | #include "llvm/IR/InstrTypes.h" |
52 | #include "llvm/IR/Instruction.h" |
53 | #include "llvm/IR/Instructions.h" |
54 | #include "llvm/IR/IntrinsicInst.h" |
55 | #include "llvm/IR/LLVMContext.h" |
56 | #include "llvm/IR/MDBuilder.h" |
57 | #include "llvm/IR/Module.h" |
58 | #include "llvm/IR/PassManager.h" |
59 | #include "llvm/IR/ProfDataUtils.h" |
60 | #include "llvm/IR/PseudoProbe.h" |
61 | #include "llvm/IR/ValueSymbolTable.h" |
62 | #include "llvm/ProfileData/InstrProf.h" |
63 | #include "llvm/ProfileData/SampleProf.h" |
64 | #include "llvm/ProfileData/SampleProfReader.h" |
65 | #include "llvm/Support/Casting.h" |
66 | #include "llvm/Support/CommandLine.h" |
67 | #include "llvm/Support/Debug.h" |
68 | #include "llvm/Support/ErrorOr.h" |
69 | #include "llvm/Support/VirtualFileSystem.h" |
70 | #include "llvm/Support/raw_ostream.h" |
71 | #include "llvm/Transforms/IPO.h" |
72 | #include "llvm/Transforms/IPO/ProfiledCallGraph.h" |
73 | #include "llvm/Transforms/IPO/SampleContextTracker.h" |
74 | #include "llvm/Transforms/IPO/SampleProfileMatcher.h" |
75 | #include "llvm/Transforms/IPO/SampleProfileProbe.h" |
76 | #include "llvm/Transforms/Instrumentation.h" |
77 | #include "llvm/Transforms/Utils/CallPromotionUtils.h" |
78 | #include "llvm/Transforms/Utils/Cloning.h" |
79 | #include "llvm/Transforms/Utils/MisExpect.h" |
80 | #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" |
81 | #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" |
82 | #include <algorithm> |
83 | #include <cassert> |
84 | #include <cstdint> |
85 | #include <functional> |
86 | #include <limits> |
87 | #include <map> |
88 | #include <memory> |
89 | #include <queue> |
90 | #include <string> |
91 | #include <system_error> |
92 | #include <utility> |
93 | #include <vector> |
94 | |
95 | using namespace llvm; |
96 | using namespace sampleprof; |
97 | using namespace llvm::sampleprofutil; |
98 | using ProfileCount = Function::ProfileCount; |
99 | #define DEBUG_TYPE "sample-profile" |
100 | #define CSINLINE_DEBUG DEBUG_TYPE "-inline" |
101 | |
102 | STATISTIC(NumCSInlined, |
103 | "Number of functions inlined with context sensitive profile" ); |
104 | STATISTIC(NumCSNotInlined, |
105 | "Number of functions not inlined with context sensitive profile" ); |
106 | STATISTIC(NumMismatchedProfile, |
107 | "Number of functions with CFG mismatched profile" ); |
108 | STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile" ); |
109 | STATISTIC(NumDuplicatedInlinesite, |
110 | "Number of inlined callsites with a partial distribution factor" ); |
111 | |
112 | STATISTIC(NumCSInlinedHitMinLimit, |
113 | "Number of functions with FDO inline stopped due to min size limit" ); |
114 | STATISTIC(NumCSInlinedHitMaxLimit, |
115 | "Number of functions with FDO inline stopped due to max size limit" ); |
116 | STATISTIC( |
117 | NumCSInlinedHitGrowthLimit, |
118 | "Number of functions with FDO inline stopped due to growth size limit" ); |
119 | |
120 | // Command line option to specify the file to read samples from. This is |
121 | // mainly used for debugging. |
122 | static cl::opt<std::string> SampleProfileFile( |
123 | "sample-profile-file" , cl::init(Val: "" ), cl::value_desc("filename" ), |
124 | cl::desc("Profile file loaded by -sample-profile" ), cl::Hidden); |
125 | |
126 | // The named file contains a set of transformations that may have been applied |
127 | // to the symbol names between the program from which the sample data was |
128 | // collected and the current program's symbols. |
129 | static cl::opt<std::string> SampleProfileRemappingFile( |
130 | "sample-profile-remapping-file" , cl::init(Val: "" ), cl::value_desc("filename" ), |
131 | cl::desc("Profile remapping file loaded by -sample-profile" ), cl::Hidden); |
132 | |
133 | cl::opt<bool> SalvageStaleProfile( |
134 | "salvage-stale-profile" , cl::Hidden, cl::init(Val: false), |
135 | cl::desc("Salvage stale profile by fuzzy matching and use the remapped " |
136 | "location for sample profile query." )); |
137 | |
138 | cl::opt<bool> ReportProfileStaleness( |
139 | "report-profile-staleness" , cl::Hidden, cl::init(Val: false), |
140 | cl::desc("Compute and report stale profile statistical metrics." )); |
141 | |
142 | cl::opt<bool> PersistProfileStaleness( |
143 | "persist-profile-staleness" , cl::Hidden, cl::init(Val: false), |
144 | cl::desc("Compute stale profile statistical metrics and write it into the " |
145 | "native object file(.llvm_stats section)." )); |
146 | |
147 | static cl::opt<bool> ProfileSampleAccurate( |
148 | "profile-sample-accurate" , cl::Hidden, cl::init(Val: false), |
149 | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
150 | "callsite and function as having 0 samples. Otherwise, treat " |
151 | "un-sampled callsites and functions conservatively as unknown. " )); |
152 | |
153 | static cl::opt<bool> ProfileSampleBlockAccurate( |
154 | "profile-sample-block-accurate" , cl::Hidden, cl::init(Val: false), |
155 | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
156 | "branches and calls as having 0 samples. Otherwise, treat " |
157 | "them conservatively as unknown. " )); |
158 | |
159 | static cl::opt<bool> ProfileAccurateForSymsInList( |
160 | "profile-accurate-for-symsinlist" , cl::Hidden, cl::init(Val: true), |
161 | cl::desc("For symbols in profile symbol list, regard their profiles to " |
162 | "be accurate. It may be overriden by profile-sample-accurate. " )); |
163 | |
164 | static cl::opt<bool> ProfileMergeInlinee( |
165 | "sample-profile-merge-inlinee" , cl::Hidden, cl::init(Val: true), |
166 | cl::desc("Merge past inlinee's profile to outline version if sample " |
167 | "profile loader decided not to inline a call site. It will " |
168 | "only be enabled when top-down order of profile loading is " |
169 | "enabled. " )); |
170 | |
171 | static cl::opt<bool> ProfileTopDownLoad( |
172 | "sample-profile-top-down-load" , cl::Hidden, cl::init(Val: true), |
173 | cl::desc("Do profile annotation and inlining for functions in top-down " |
174 | "order of call graph during sample profile loading. It only " |
175 | "works for new pass manager. " )); |
176 | |
177 | static cl::opt<bool> |
178 | UseProfiledCallGraph("use-profiled-call-graph" , cl::init(Val: true), cl::Hidden, |
179 | cl::desc("Process functions in a top-down order " |
180 | "defined by the profiled call graph when " |
181 | "-sample-profile-top-down-load is on." )); |
182 | |
183 | static cl::opt<bool> ProfileSizeInline( |
184 | "sample-profile-inline-size" , cl::Hidden, cl::init(Val: false), |
185 | cl::desc("Inline cold call sites in profile loader if it's beneficial " |
186 | "for code size." )); |
187 | |
188 | // Since profiles are consumed by many passes, turning on this option has |
189 | // side effects. For instance, pre-link SCC inliner would see merged profiles |
190 | // and inline the hot functions (that are skipped in this pass). |
191 | static cl::opt<bool> DisableSampleLoaderInlining( |
192 | "disable-sample-loader-inlining" , cl::Hidden, cl::init(Val: false), |
193 | cl::desc("If true, artifically skip inline transformation in sample-loader " |
194 | "pass, and merge (or scale) profiles (as configured by " |
195 | "--sample-profile-merge-inlinee)." )); |
196 | |
197 | namespace llvm { |
198 | cl::opt<bool> |
199 | SortProfiledSCC("sort-profiled-scc-member" , cl::init(Val: true), cl::Hidden, |
200 | cl::desc("Sort profiled recursion by edge weights." )); |
201 | |
202 | cl::opt<int> ProfileInlineGrowthLimit( |
203 | "sample-profile-inline-growth-limit" , cl::Hidden, cl::init(Val: 12), |
204 | cl::desc("The size growth ratio limit for proirity-based sample profile " |
205 | "loader inlining." )); |
206 | |
207 | cl::opt<int> ProfileInlineLimitMin( |
208 | "sample-profile-inline-limit-min" , cl::Hidden, cl::init(Val: 100), |
209 | cl::desc("The lower bound of size growth limit for " |
210 | "proirity-based sample profile loader inlining." )); |
211 | |
212 | cl::opt<int> ProfileInlineLimitMax( |
213 | "sample-profile-inline-limit-max" , cl::Hidden, cl::init(Val: 10000), |
214 | cl::desc("The upper bound of size growth limit for " |
215 | "proirity-based sample profile loader inlining." )); |
216 | |
217 | cl::opt<int> SampleHotCallSiteThreshold( |
218 | "sample-profile-hot-inline-threshold" , cl::Hidden, cl::init(Val: 3000), |
219 | cl::desc("Hot callsite threshold for proirity-based sample profile loader " |
220 | "inlining." )); |
221 | |
222 | cl::opt<int> SampleColdCallSiteThreshold( |
223 | "sample-profile-cold-inline-threshold" , cl::Hidden, cl::init(Val: 45), |
224 | cl::desc("Threshold for inlining cold callsites" )); |
225 | } // namespace llvm |
226 | |
227 | static cl::opt<unsigned> ProfileICPRelativeHotness( |
228 | "sample-profile-icp-relative-hotness" , cl::Hidden, cl::init(Val: 25), |
229 | cl::desc( |
230 | "Relative hotness percentage threshold for indirect " |
231 | "call promotion in proirity-based sample profile loader inlining." )); |
232 | |
233 | static cl::opt<unsigned> ProfileICPRelativeHotnessSkip( |
234 | "sample-profile-icp-relative-hotness-skip" , cl::Hidden, cl::init(Val: 1), |
235 | cl::desc( |
236 | "Skip relative hotness check for ICP up to given number of targets." )); |
237 | |
238 | static cl::opt<unsigned> HotFuncCutoffForStalenessError( |
239 | "hot-func-cutoff-for-staleness-error" , cl::Hidden, cl::init(Val: 800000), |
240 | cl::desc("A function is considered hot for staleness error check if its " |
241 | "total sample count is above the specified percentile" )); |
242 | |
243 | static cl::opt<unsigned> MinfuncsForStalenessError( |
244 | "min-functions-for-staleness-error" , cl::Hidden, cl::init(Val: 50), |
245 | cl::desc("Skip the check if the number of hot functions is smaller than " |
246 | "the specified number." )); |
247 | |
248 | static cl::opt<unsigned> PrecentMismatchForStalenessError( |
249 | "precent-mismatch-for-staleness-error" , cl::Hidden, cl::init(Val: 80), |
250 | cl::desc("Reject the profile if the mismatch percent is higher than the " |
251 | "given number." )); |
252 | |
253 | static cl::opt<bool> CallsitePrioritizedInline( |
254 | "sample-profile-prioritized-inline" , cl::Hidden, |
255 | |
256 | cl::desc("Use call site prioritized inlining for sample profile loader." |
257 | "Currently only CSSPGO is supported." )); |
258 | |
259 | static cl::opt<bool> UsePreInlinerDecision( |
260 | "sample-profile-use-preinliner" , cl::Hidden, |
261 | |
262 | cl::desc("Use the preinliner decisions stored in profile context." )); |
263 | |
264 | static cl::opt<bool> AllowRecursiveInline( |
265 | "sample-profile-recursive-inline" , cl::Hidden, |
266 | |
267 | cl::desc("Allow sample loader inliner to inline recursive calls." )); |
268 | |
269 | static cl::opt<std::string> ProfileInlineReplayFile( |
270 | "sample-profile-inline-replay" , cl::init(Val: "" ), cl::value_desc("filename" ), |
271 | cl::desc( |
272 | "Optimization remarks file containing inline remarks to be replayed " |
273 | "by inlining from sample profile loader." ), |
274 | cl::Hidden); |
275 | |
276 | static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope( |
277 | "sample-profile-inline-replay-scope" , |
278 | cl::init(Val: ReplayInlinerSettings::Scope::Function), |
279 | cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function" , |
280 | "Replay on functions that have remarks associated " |
281 | "with them (default)" ), |
282 | clEnumValN(ReplayInlinerSettings::Scope::Module, "Module" , |
283 | "Replay on the entire module" )), |
284 | cl::desc("Whether inline replay should be applied to the entire " |
285 | "Module or just the Functions (default) that are present as " |
286 | "callers in remarks during sample profile inlining." ), |
287 | cl::Hidden); |
288 | |
289 | static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback( |
290 | "sample-profile-inline-replay-fallback" , |
291 | cl::init(Val: ReplayInlinerSettings::Fallback::Original), |
292 | cl::values( |
293 | clEnumValN( |
294 | ReplayInlinerSettings::Fallback::Original, "Original" , |
295 | "All decisions not in replay send to original advisor (default)" ), |
296 | clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, |
297 | "AlwaysInline" , "All decisions not in replay are inlined" ), |
298 | clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline" , |
299 | "All decisions not in replay are not inlined" )), |
300 | cl::desc("How sample profile inline replay treats sites that don't come " |
301 | "from the replay. Original: defers to original advisor, " |
302 | "AlwaysInline: inline all sites not in replay, NeverInline: " |
303 | "inline no sites not in replay" ), |
304 | cl::Hidden); |
305 | |
306 | static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat( |
307 | "sample-profile-inline-replay-format" , |
308 | cl::init(Val: CallSiteFormat::Format::LineColumnDiscriminator), |
309 | cl::values( |
310 | clEnumValN(CallSiteFormat::Format::Line, "Line" , "<Line Number>" ), |
311 | clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn" , |
312 | "<Line Number>:<Column Number>" ), |
313 | clEnumValN(CallSiteFormat::Format::LineDiscriminator, |
314 | "LineDiscriminator" , "<Line Number>.<Discriminator>" ), |
315 | clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, |
316 | "LineColumnDiscriminator" , |
317 | "<Line Number>:<Column Number>.<Discriminator> (default)" )), |
318 | cl::desc("How sample profile inline replay file is formatted" ), cl::Hidden); |
319 | |
320 | static cl::opt<unsigned> |
321 | MaxNumPromotions("sample-profile-icp-max-prom" , cl::init(Val: 3), cl::Hidden, |
322 | cl::desc("Max number of promotions for a single indirect " |
323 | "call callsite in sample profile loader" )); |
324 | |
325 | static cl::opt<bool> OverwriteExistingWeights( |
326 | "overwrite-existing-weights" , cl::Hidden, cl::init(Val: false), |
327 | cl::desc("Ignore existing branch weights on IR and always overwrite." )); |
328 | |
329 | static cl::opt<bool> AnnotateSampleProfileInlinePhase( |
330 | "annotate-sample-profile-inline-phase" , cl::Hidden, cl::init(Val: false), |
331 | cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " |
332 | "sample-profile inline pass name." )); |
333 | |
334 | namespace llvm { |
335 | extern cl::opt<bool> EnableExtTspBlockPlacement; |
336 | } |
337 | |
338 | namespace { |
339 | |
340 | using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; |
341 | using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>; |
342 | using Edge = std::pair<const BasicBlock *, const BasicBlock *>; |
343 | using EdgeWeightMap = DenseMap<Edge, uint64_t>; |
344 | using BlockEdgeMap = |
345 | DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; |
346 | |
347 | class GUIDToFuncNameMapper { |
348 | public: |
349 | GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader, |
350 | DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap) |
351 | : CurrentReader(Reader), CurrentModule(M), |
352 | CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) { |
353 | if (!CurrentReader.useMD5()) |
354 | return; |
355 | |
356 | for (const auto &F : CurrentModule) { |
357 | StringRef OrigName = F.getName(); |
358 | CurrentGUIDToFuncNameMap.insert( |
359 | KV: {Function::getGUID(GlobalName: OrigName), OrigName}); |
360 | |
361 | // Local to global var promotion used by optimization like thinlto |
362 | // will rename the var and add suffix like ".llvm.xxx" to the |
363 | // original local name. In sample profile, the suffixes of function |
364 | // names are all stripped. Since it is possible that the mapper is |
365 | // built in post-thin-link phase and var promotion has been done, |
366 | // we need to add the substring of function name without the suffix |
367 | // into the GUIDToFuncNameMap. |
368 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
369 | if (CanonName != OrigName) |
370 | CurrentGUIDToFuncNameMap.insert( |
371 | KV: {Function::getGUID(GlobalName: CanonName), CanonName}); |
372 | } |
373 | |
374 | // Update GUIDToFuncNameMap for each function including inlinees. |
375 | SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap); |
376 | } |
377 | |
378 | ~GUIDToFuncNameMapper() { |
379 | if (!CurrentReader.useMD5()) |
380 | return; |
381 | |
382 | CurrentGUIDToFuncNameMap.clear(); |
383 | |
384 | // Reset GUIDToFuncNameMap for of each function as they're no |
385 | // longer valid at this point. |
386 | SetGUIDToFuncNameMapForAll(nullptr); |
387 | } |
388 | |
389 | private: |
390 | void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) { |
391 | std::queue<FunctionSamples *> FSToUpdate; |
392 | for (auto &IFS : CurrentReader.getProfiles()) { |
393 | FSToUpdate.push(x: &IFS.second); |
394 | } |
395 | |
396 | while (!FSToUpdate.empty()) { |
397 | FunctionSamples *FS = FSToUpdate.front(); |
398 | FSToUpdate.pop(); |
399 | FS->GUIDToFuncNameMap = Map; |
400 | for (const auto &ICS : FS->getCallsiteSamples()) { |
401 | const FunctionSamplesMap &FSMap = ICS.second; |
402 | for (const auto &IFS : FSMap) { |
403 | FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second); |
404 | FSToUpdate.push(x: &FS); |
405 | } |
406 | } |
407 | } |
408 | } |
409 | |
410 | SampleProfileReader &CurrentReader; |
411 | Module &CurrentModule; |
412 | DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; |
413 | }; |
414 | |
415 | // Inline candidate used by iterative callsite prioritized inliner |
416 | struct InlineCandidate { |
417 | CallBase *CallInstr; |
418 | const FunctionSamples *CalleeSamples; |
419 | // Prorated callsite count, which will be used to guide inlining. For example, |
420 | // if a callsite is duplicated in LTO prelink, then in LTO postlink the two |
421 | // copies will get their own distribution factors and their prorated counts |
422 | // will be used to decide if they should be inlined independently. |
423 | uint64_t CallsiteCount; |
424 | // Call site distribution factor to prorate the profile samples for a |
425 | // duplicated callsite. Default value is 1.0. |
426 | float CallsiteDistribution; |
427 | }; |
428 | |
429 | // Inline candidate comparer using call site weight |
430 | struct CandidateComparer { |
431 | bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) { |
432 | if (LHS.CallsiteCount != RHS.CallsiteCount) |
433 | return LHS.CallsiteCount < RHS.CallsiteCount; |
434 | |
435 | const FunctionSamples *LCS = LHS.CalleeSamples; |
436 | const FunctionSamples *RCS = RHS.CalleeSamples; |
437 | assert(LCS && RCS && "Expect non-null FunctionSamples" ); |
438 | |
439 | // Tie breaker using number of samples try to favor smaller functions first |
440 | if (LCS->getBodySamples().size() != RCS->getBodySamples().size()) |
441 | return LCS->getBodySamples().size() > RCS->getBodySamples().size(); |
442 | |
443 | // Tie breaker using GUID so we have stable/deterministic inlining order |
444 | return LCS->getGUID() < RCS->getGUID(); |
445 | } |
446 | }; |
447 | |
448 | using CandidateQueue = |
449 | PriorityQueue<InlineCandidate, std::vector<InlineCandidate>, |
450 | CandidateComparer>; |
451 | |
452 | /// Sample profile pass. |
453 | /// |
454 | /// This pass reads profile data from the file specified by |
455 | /// -sample-profile-file and annotates every affected function with the |
456 | /// profile information found in that file. |
457 | class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> { |
458 | public: |
459 | SampleProfileLoader( |
460 | StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase, |
461 | IntrusiveRefCntPtr<vfs::FileSystem> FS, |
462 | std::function<AssumptionCache &(Function &)> GetAssumptionCache, |
463 | std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo, |
464 | std::function<const TargetLibraryInfo &(Function &)> GetTLI) |
465 | : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName), |
466 | std::move(FS)), |
467 | GetAC(std::move(GetAssumptionCache)), |
468 | GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), |
469 | LTOPhase(LTOPhase), |
470 | AnnotatedPassName(AnnotateSampleProfileInlinePhase |
471 | ? llvm::AnnotateInlinePassName(IC: InlineContext{ |
472 | .LTOPhase: LTOPhase, .Pass: InlinePass::SampleProfileInliner}) |
473 | : CSINLINE_DEBUG) {} |
474 | |
475 | bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); |
476 | bool runOnModule(Module &M, ModuleAnalysisManager *AM, |
477 | ProfileSummaryInfo *_PSI, LazyCallGraph &CG); |
478 | |
479 | protected: |
480 | bool runOnFunction(Function &F, ModuleAnalysisManager *AM); |
481 | bool emitAnnotations(Function &F); |
482 | ErrorOr<uint64_t> getInstWeight(const Instruction &I) override; |
483 | const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const; |
484 | const FunctionSamples * |
485 | findFunctionSamples(const Instruction &I) const override; |
486 | std::vector<const FunctionSamples *> |
487 | findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; |
488 | void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples, |
489 | DenseSet<GlobalValue::GUID> &InlinedGUIDs, |
490 | uint64_t Threshold); |
491 | // Attempt to promote indirect call and also inline the promoted call |
492 | bool tryPromoteAndInlineCandidate( |
493 | Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, |
494 | uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); |
495 | |
496 | bool inlineHotFunctions(Function &F, |
497 | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
498 | std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB); |
499 | bool getExternalInlineAdvisorShouldInline(CallBase &CB); |
500 | InlineCost shouldInlineCandidate(InlineCandidate &Candidate); |
501 | bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); |
502 | bool |
503 | tryInlineCandidate(InlineCandidate &Candidate, |
504 | SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); |
505 | bool |
506 | inlineHotFunctionsWithPriority(Function &F, |
507 | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
508 | // Inline cold/small functions in addition to hot ones |
509 | bool shouldInlineColdCallee(CallBase &CallInst); |
510 | void emitOptimizationRemarksForInlineCandidates( |
511 | const SmallVectorImpl<CallBase *> &Candidates, const Function &F, |
512 | bool Hot); |
513 | void promoteMergeNotInlinedContextSamples( |
514 | MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites, |
515 | const Function &F); |
516 | std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG); |
517 | std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M); |
518 | void generateMDProfMetadata(Function &F); |
519 | bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI, |
520 | const SampleProfileMap &Profiles); |
521 | |
522 | /// Map from function name to Function *. Used to find the function from |
523 | /// the function name. If the function name contains suffix, additional |
524 | /// entry is added to map from the stripped name to the function if there |
525 | /// is one-to-one mapping. |
526 | HashKeyMap<std::unordered_map, FunctionId, Function *> SymbolMap; |
527 | |
528 | std::function<AssumptionCache &(Function &)> GetAC; |
529 | std::function<TargetTransformInfo &(Function &)> GetTTI; |
530 | std::function<const TargetLibraryInfo &(Function &)> GetTLI; |
531 | |
532 | /// Profile tracker for different context. |
533 | std::unique_ptr<SampleContextTracker> ContextTracker; |
534 | |
535 | /// Flag indicating which LTO/ThinLTO phase the pass is invoked in. |
536 | /// |
537 | /// We need to know the LTO phase because for example in ThinLTOPrelink |
538 | /// phase, in annotation, we should not promote indirect calls. Instead, |
539 | /// we will mark GUIDs that needs to be annotated to the function. |
540 | const ThinOrFullLTOPhase LTOPhase; |
541 | const std::string AnnotatedPassName; |
542 | |
543 | /// Profle Symbol list tells whether a function name appears in the binary |
544 | /// used to generate the current profile. |
545 | std::unique_ptr<ProfileSymbolList> PSL; |
546 | |
547 | /// Total number of samples collected in this profile. |
548 | /// |
549 | /// This is the sum of all the samples collected in all the functions executed |
550 | /// at runtime. |
551 | uint64_t TotalCollectedSamples = 0; |
552 | |
553 | // Information recorded when we declined to inline a call site |
554 | // because we have determined it is too cold is accumulated for |
555 | // each callee function. Initially this is just the entry count. |
556 | struct NotInlinedProfileInfo { |
557 | uint64_t entryCount; |
558 | }; |
559 | DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo; |
560 | |
561 | // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for |
562 | // all the function symbols defined or declared in current module. |
563 | DenseMap<uint64_t, StringRef> GUIDToFuncNameMap; |
564 | |
565 | // All the Names used in FunctionSamples including outline function |
566 | // names, inline instance names and call target names. |
567 | StringSet<> NamesInProfile; |
568 | // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is |
569 | // populated, depends on whether the profile uses MD5. Because the name table |
570 | // generally contains several magnitude more entries than the number of |
571 | // functions, we do not want to convert all names from one form to another. |
572 | llvm::DenseSet<uint64_t> GUIDsInProfile; |
573 | |
574 | // For symbol in profile symbol list, whether to regard their profiles |
575 | // to be accurate. It is mainly decided by existance of profile symbol |
576 | // list and -profile-accurate-for-symsinlist flag, but it can be |
577 | // overriden by -profile-sample-accurate or profile-sample-accurate |
578 | // attribute. |
579 | bool ProfAccForSymsInList; |
580 | |
581 | // External inline advisor used to replay inline decision from remarks. |
582 | std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor; |
583 | |
584 | // A helper to implement the sample profile matching algorithm. |
585 | std::unique_ptr<SampleProfileMatcher> MatchingManager; |
586 | |
587 | private: |
588 | const char *() const { |
589 | return AnnotatedPassName.c_str(); |
590 | } |
591 | }; |
592 | } // end anonymous namespace |
593 | |
594 | namespace llvm { |
595 | template <> |
596 | inline bool SampleProfileInference<Function>::isExit(const BasicBlock *BB) { |
597 | return succ_empty(BB); |
598 | } |
599 | |
600 | template <> |
601 | inline void SampleProfileInference<Function>::findUnlikelyJumps( |
602 | const std::vector<const BasicBlockT *> &BasicBlocks, |
603 | BlockEdgeMap &Successors, FlowFunction &Func) { |
604 | for (auto &Jump : Func.Jumps) { |
605 | const auto *BB = BasicBlocks[Jump.Source]; |
606 | const auto *Succ = BasicBlocks[Jump.Target]; |
607 | const Instruction *TI = BB->getTerminator(); |
608 | // Check if a block ends with InvokeInst and mark non-taken branch unlikely. |
609 | // In that case block Succ should be a landing pad |
610 | if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) { |
611 | if (isa<InvokeInst>(Val: TI)) { |
612 | Jump.IsUnlikely = true; |
613 | } |
614 | } |
615 | const Instruction *SuccTI = Succ->getTerminator(); |
616 | // Check if the target block contains UnreachableInst and mark it unlikely |
617 | if (SuccTI->getNumSuccessors() == 0) { |
618 | if (isa<UnreachableInst>(Val: SuccTI)) { |
619 | Jump.IsUnlikely = true; |
620 | } |
621 | } |
622 | } |
623 | } |
624 | |
625 | template <> |
626 | void SampleProfileLoaderBaseImpl<Function>::computeDominanceAndLoopInfo( |
627 | Function &F) { |
628 | DT.reset(p: new DominatorTree); |
629 | DT->recalculate(Func&: F); |
630 | |
631 | PDT.reset(p: new PostDominatorTree(F)); |
632 | |
633 | LI.reset(p: new LoopInfo); |
634 | LI->analyze(DomTree: *DT); |
635 | } |
636 | } // namespace llvm |
637 | |
638 | ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { |
639 | if (FunctionSamples::ProfileIsProbeBased) |
640 | return getProbeWeight(Inst); |
641 | |
642 | const DebugLoc &DLoc = Inst.getDebugLoc(); |
643 | if (!DLoc) |
644 | return std::error_code(); |
645 | |
646 | // Ignore all intrinsics, phinodes and branch instructions. |
647 | // Branch and phinodes instruction usually contains debug info from sources |
648 | // outside of the residing basic block, thus we ignore them during annotation. |
649 | if (isa<BranchInst>(Val: Inst) || isa<IntrinsicInst>(Val: Inst) || isa<PHINode>(Val: Inst)) |
650 | return std::error_code(); |
651 | |
652 | // For non-CS profile, if a direct call/invoke instruction is inlined in |
653 | // profile (findCalleeFunctionSamples returns non-empty result), but not |
654 | // inlined here, it means that the inlined callsite has no sample, thus the |
655 | // call instruction should have 0 count. |
656 | // For CS profile, the callsite count of previously inlined callees is |
657 | // populated with the entry count of the callees. |
658 | if (!FunctionSamples::ProfileIsCS) |
659 | if (const auto *CB = dyn_cast<CallBase>(Val: &Inst)) |
660 | if (!CB->isIndirectCall() && findCalleeFunctionSamples(I: *CB)) |
661 | return 0; |
662 | |
663 | return getInstWeightImpl(Inst); |
664 | } |
665 | |
666 | /// Get the FunctionSamples for a call instruction. |
667 | /// |
668 | /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined |
669 | /// instance in which that call instruction is calling to. It contains |
670 | /// all samples that resides in the inlined instance. We first find the |
671 | /// inlined instance in which the call instruction is from, then we |
672 | /// traverse its children to find the callsite with the matching |
673 | /// location. |
674 | /// |
675 | /// \param Inst Call/Invoke instruction to query. |
676 | /// |
677 | /// \returns The FunctionSamples pointer to the inlined instance. |
678 | const FunctionSamples * |
679 | SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { |
680 | const DILocation *DIL = Inst.getDebugLoc(); |
681 | if (!DIL) { |
682 | return nullptr; |
683 | } |
684 | |
685 | StringRef CalleeName; |
686 | if (Function *Callee = Inst.getCalledFunction()) |
687 | CalleeName = Callee->getName(); |
688 | |
689 | if (FunctionSamples::ProfileIsCS) |
690 | return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName); |
691 | |
692 | const FunctionSamples *FS = findFunctionSamples(I: Inst); |
693 | if (FS == nullptr) |
694 | return nullptr; |
695 | |
696 | return FS->findFunctionSamplesAt(Loc: FunctionSamples::getCallSiteIdentifier(DIL), |
697 | CalleeName, Remapper: Reader->getRemapper()); |
698 | } |
699 | |
700 | /// Returns a vector of FunctionSamples that are the indirect call targets |
701 | /// of \p Inst. The vector is sorted by the total number of samples. Stores |
702 | /// the total call count of the indirect call in \p Sum. |
703 | std::vector<const FunctionSamples *> |
704 | SampleProfileLoader::findIndirectCallFunctionSamples( |
705 | const Instruction &Inst, uint64_t &Sum) const { |
706 | const DILocation *DIL = Inst.getDebugLoc(); |
707 | std::vector<const FunctionSamples *> R; |
708 | |
709 | if (!DIL) { |
710 | return R; |
711 | } |
712 | |
713 | auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) { |
714 | assert(L && R && "Expect non-null FunctionSamples" ); |
715 | if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate()) |
716 | return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate(); |
717 | return L->getGUID() < R->getGUID(); |
718 | }; |
719 | |
720 | if (FunctionSamples::ProfileIsCS) { |
721 | auto CalleeSamples = |
722 | ContextTracker->getIndirectCalleeContextSamplesFor(DIL); |
723 | if (CalleeSamples.empty()) |
724 | return R; |
725 | |
726 | // For CSSPGO, we only use target context profile's entry count |
727 | // as that already includes both inlined callee and non-inlined ones.. |
728 | Sum = 0; |
729 | for (const auto *const FS : CalleeSamples) { |
730 | Sum += FS->getHeadSamplesEstimate(); |
731 | R.push_back(x: FS); |
732 | } |
733 | llvm::sort(C&: R, Comp: FSCompare); |
734 | return R; |
735 | } |
736 | |
737 | const FunctionSamples *FS = findFunctionSamples(I: Inst); |
738 | if (FS == nullptr) |
739 | return R; |
740 | |
741 | auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); |
742 | Sum = 0; |
743 | if (auto T = FS->findCallTargetMapAt(CallSite)) |
744 | for (const auto &T_C : *T) |
745 | Sum += T_C.second; |
746 | if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(Loc: CallSite)) { |
747 | if (M->empty()) |
748 | return R; |
749 | for (const auto &NameFS : *M) { |
750 | Sum += NameFS.second.getHeadSamplesEstimate(); |
751 | R.push_back(x: &NameFS.second); |
752 | } |
753 | llvm::sort(C&: R, Comp: FSCompare); |
754 | } |
755 | return R; |
756 | } |
757 | |
758 | const FunctionSamples * |
759 | SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { |
760 | if (FunctionSamples::ProfileIsProbeBased) { |
761 | std::optional<PseudoProbe> Probe = extractProbe(Inst); |
762 | if (!Probe) |
763 | return nullptr; |
764 | } |
765 | |
766 | const DILocation *DIL = Inst.getDebugLoc(); |
767 | if (!DIL) |
768 | return Samples; |
769 | |
770 | auto it = DILocation2SampleMap.try_emplace(Key: DIL,Args: nullptr); |
771 | if (it.second) { |
772 | if (FunctionSamples::ProfileIsCS) |
773 | it.first->second = ContextTracker->getContextSamplesFor(DIL); |
774 | else |
775 | it.first->second = |
776 | Samples->findFunctionSamples(DIL, Remapper: Reader->getRemapper()); |
777 | } |
778 | return it.first->second; |
779 | } |
780 | |
781 | /// Check whether the indirect call promotion history of \p Inst allows |
782 | /// the promotion for \p Candidate. |
783 | /// If the profile count for the promotion candidate \p Candidate is |
784 | /// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted |
785 | /// for \p Inst. If we already have at least MaxNumPromotions |
786 | /// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we |
787 | /// cannot promote for \p Inst anymore. |
788 | static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) { |
789 | uint32_t NumVals = 0; |
790 | uint64_t TotalCount = 0; |
791 | std::unique_ptr<InstrProfValueData[]> ValueData = |
792 | std::make_unique<InstrProfValueData[]>(num: MaxNumPromotions); |
793 | bool Valid = |
794 | getValueProfDataFromInst(Inst, ValueKind: IPVK_IndirectCallTarget, MaxNumValueData: MaxNumPromotions, |
795 | ValueData: ValueData.get(), ActualNumValueData&: NumVals, TotalC&: TotalCount, GetNoICPValue: true); |
796 | // No valid value profile so no promoted targets have been recorded |
797 | // before. Ok to do ICP. |
798 | if (!Valid) |
799 | return true; |
800 | |
801 | unsigned NumPromoted = 0; |
802 | for (uint32_t I = 0; I < NumVals; I++) { |
803 | if (ValueData[I].Count != NOMORE_ICP_MAGICNUM) |
804 | continue; |
805 | |
806 | // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the |
807 | // metadata, it means the candidate has been promoted for this |
808 | // indirect call. |
809 | if (ValueData[I].Value == Function::getGUID(GlobalName: Candidate)) |
810 | return false; |
811 | NumPromoted++; |
812 | // If already have MaxNumPromotions promotion, don't do it anymore. |
813 | if (NumPromoted == MaxNumPromotions) |
814 | return false; |
815 | } |
816 | return true; |
817 | } |
818 | |
819 | /// Update indirect call target profile metadata for \p Inst. |
820 | /// Usually \p Sum is the sum of counts of all the targets for \p Inst. |
821 | /// If it is 0, it means updateIDTMetaData is used to mark a |
822 | /// certain target to be promoted already. If it is not zero, |
823 | /// we expect to use it to update the total count in the value profile. |
824 | static void |
825 | updateIDTMetaData(Instruction &Inst, |
826 | const SmallVectorImpl<InstrProfValueData> &CallTargets, |
827 | uint64_t Sum) { |
828 | // Bail out early if MaxNumPromotions is zero. |
829 | // This prevents allocating an array of zero length below. |
830 | // |
831 | // Note `updateIDTMetaData` is called in two places so check |
832 | // `MaxNumPromotions` inside it. |
833 | if (MaxNumPromotions == 0) |
834 | return; |
835 | uint32_t NumVals = 0; |
836 | // OldSum is the existing total count in the value profile data. |
837 | uint64_t OldSum = 0; |
838 | std::unique_ptr<InstrProfValueData[]> ValueData = |
839 | std::make_unique<InstrProfValueData[]>(num: MaxNumPromotions); |
840 | bool Valid = |
841 | getValueProfDataFromInst(Inst, ValueKind: IPVK_IndirectCallTarget, MaxNumValueData: MaxNumPromotions, |
842 | ValueData: ValueData.get(), ActualNumValueData&: NumVals, TotalC&: OldSum, GetNoICPValue: true); |
843 | |
844 | DenseMap<uint64_t, uint64_t> ValueCountMap; |
845 | if (Sum == 0) { |
846 | assert((CallTargets.size() == 1 && |
847 | CallTargets[0].Count == NOMORE_ICP_MAGICNUM) && |
848 | "If sum is 0, assume only one element in CallTargets " |
849 | "with count being NOMORE_ICP_MAGICNUM" ); |
850 | // Initialize ValueCountMap with existing value profile data. |
851 | if (Valid) { |
852 | for (uint32_t I = 0; I < NumVals; I++) |
853 | ValueCountMap[ValueData[I].Value] = ValueData[I].Count; |
854 | } |
855 | auto Pair = |
856 | ValueCountMap.try_emplace(Key: CallTargets[0].Value, Args: CallTargets[0].Count); |
857 | // If the target already exists in value profile, decrease the total |
858 | // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM. |
859 | if (!Pair.second) { |
860 | OldSum -= Pair.first->second; |
861 | Pair.first->second = NOMORE_ICP_MAGICNUM; |
862 | } |
863 | Sum = OldSum; |
864 | } else { |
865 | // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM |
866 | // counts in the value profile. |
867 | if (Valid) { |
868 | for (uint32_t I = 0; I < NumVals; I++) { |
869 | if (ValueData[I].Count == NOMORE_ICP_MAGICNUM) |
870 | ValueCountMap[ValueData[I].Value] = ValueData[I].Count; |
871 | } |
872 | } |
873 | |
874 | for (const auto &Data : CallTargets) { |
875 | auto Pair = ValueCountMap.try_emplace(Key: Data.Value, Args: Data.Count); |
876 | if (Pair.second) |
877 | continue; |
878 | // The target represented by Data.Value has already been promoted. |
879 | // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease |
880 | // Sum by Data.Count. |
881 | assert(Sum >= Data.Count && "Sum should never be less than Data.Count" ); |
882 | Sum -= Data.Count; |
883 | } |
884 | } |
885 | |
886 | SmallVector<InstrProfValueData, 8> NewCallTargets; |
887 | for (const auto &ValueCount : ValueCountMap) { |
888 | NewCallTargets.emplace_back( |
889 | Args: InstrProfValueData{.Value: ValueCount.first, .Count: ValueCount.second}); |
890 | } |
891 | |
892 | llvm::sort(C&: NewCallTargets, |
893 | Comp: [](const InstrProfValueData &L, const InstrProfValueData &R) { |
894 | if (L.Count != R.Count) |
895 | return L.Count > R.Count; |
896 | return L.Value > R.Value; |
897 | }); |
898 | |
899 | uint32_t MaxMDCount = |
900 | std::min(a: NewCallTargets.size(), b: static_cast<size_t>(MaxNumPromotions)); |
901 | annotateValueSite(M&: *Inst.getParent()->getParent()->getParent(), Inst, |
902 | VDs: NewCallTargets, Sum, ValueKind: IPVK_IndirectCallTarget, MaxMDCount); |
903 | } |
904 | |
905 | /// Attempt to promote indirect call and also inline the promoted call. |
906 | /// |
907 | /// \param F Caller function. |
908 | /// \param Candidate ICP and inline candidate. |
909 | /// \param SumOrigin Original sum of target counts for indirect call before |
910 | /// promoting given candidate. |
911 | /// \param Sum Prorated sum of remaining target counts for indirect call |
912 | /// after promoting given candidate. |
913 | /// \param InlinedCallSite Output vector for new call sites exposed after |
914 | /// inlining. |
915 | bool SampleProfileLoader::tryPromoteAndInlineCandidate( |
916 | Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, |
917 | SmallVector<CallBase *, 8> *InlinedCallSite) { |
918 | // Bail out early if sample-loader inliner is disabled. |
919 | if (DisableSampleLoaderInlining) |
920 | return false; |
921 | |
922 | // Bail out early if MaxNumPromotions is zero. |
923 | // This prevents allocating an array of zero length in callees below. |
924 | if (MaxNumPromotions == 0) |
925 | return false; |
926 | auto CalleeFunctionName = Candidate.CalleeSamples->getFunction(); |
927 | auto R = SymbolMap.find(Key: CalleeFunctionName); |
928 | if (R == SymbolMap.end() || !R->second) |
929 | return false; |
930 | |
931 | auto &CI = *Candidate.CallInstr; |
932 | if (!doesHistoryAllowICP(Inst: CI, Candidate: R->second->getName())) |
933 | return false; |
934 | |
935 | const char *Reason = "Callee function not available" ; |
936 | // R->getValue() != &F is to prevent promoting a recursive call. |
937 | // If it is a recursive call, we do not inline it as it could bloat |
938 | // the code exponentially. There is way to better handle this, e.g. |
939 | // clone the caller first, and inline the cloned caller if it is |
940 | // recursive. As llvm does not inline recursive calls, we will |
941 | // simply ignore it instead of handling it explicitly. |
942 | if (!R->second->isDeclaration() && R->second->getSubprogram() && |
943 | R->second->hasFnAttribute(Kind: "use-sample-profile" ) && |
944 | R->second != &F && isLegalToPromote(CB: CI, Callee: R->second, FailureReason: &Reason)) { |
945 | // For promoted target, set its value with NOMORE_ICP_MAGICNUM count |
946 | // in the value profile metadata so the target won't be promoted again. |
947 | SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{ |
948 | .Value: Function::getGUID(GlobalName: R->second->getName()), .Count: NOMORE_ICP_MAGICNUM}}; |
949 | updateIDTMetaData(Inst&: CI, CallTargets: SortedCallTargets, Sum: 0); |
950 | |
951 | auto *DI = &pgo::promoteIndirectCall( |
952 | CB&: CI, F: R->second, Count: Candidate.CallsiteCount, TotalCount: Sum, AttachProfToDirectCall: false, ORE); |
953 | if (DI) { |
954 | Sum -= Candidate.CallsiteCount; |
955 | // Do not prorate the indirect callsite distribution since the original |
956 | // distribution will be used to scale down non-promoted profile target |
957 | // counts later. By doing this we lose track of the real callsite count |
958 | // for the leftover indirect callsite as a trade off for accurate call |
959 | // target counts. |
960 | // TODO: Ideally we would have two separate factors, one for call site |
961 | // counts and one is used to prorate call target counts. |
962 | // Do not update the promoted direct callsite distribution at this |
963 | // point since the original distribution combined with the callee profile |
964 | // will be used to prorate callsites from the callee if inlined. Once not |
965 | // inlined, the direct callsite distribution should be prorated so that |
966 | // the it will reflect the real callsite counts. |
967 | Candidate.CallInstr = DI; |
968 | if (isa<CallInst>(Val: DI) || isa<InvokeInst>(Val: DI)) { |
969 | bool Inlined = tryInlineCandidate(Candidate, InlinedCallSites: InlinedCallSite); |
970 | if (!Inlined) { |
971 | // Prorate the direct callsite distribution so that it reflects real |
972 | // callsite counts. |
973 | setProbeDistributionFactor( |
974 | Inst&: *DI, Factor: static_cast<float>(Candidate.CallsiteCount) / SumOrigin); |
975 | } |
976 | return Inlined; |
977 | } |
978 | } |
979 | } else { |
980 | LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to " |
981 | << FunctionSamples::getCanonicalFnName( |
982 | Candidate.CallInstr->getName())<< " because " |
983 | << Reason << "\n" ); |
984 | } |
985 | return false; |
986 | } |
987 | |
988 | bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) { |
989 | if (!ProfileSizeInline) |
990 | return false; |
991 | |
992 | Function *Callee = CallInst.getCalledFunction(); |
993 | if (Callee == nullptr) |
994 | return false; |
995 | |
996 | InlineCost Cost = getInlineCost(Call&: CallInst, Params: getInlineParams(), CalleeTTI&: GetTTI(*Callee), |
997 | GetAssumptionCache: GetAC, GetTLI); |
998 | |
999 | if (Cost.isNever()) |
1000 | return false; |
1001 | |
1002 | if (Cost.isAlways()) |
1003 | return true; |
1004 | |
1005 | return Cost.getCost() <= SampleColdCallSiteThreshold; |
1006 | } |
1007 | |
1008 | void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( |
1009 | const SmallVectorImpl<CallBase *> &Candidates, const Function &F, |
1010 | bool Hot) { |
1011 | for (auto *I : Candidates) { |
1012 | Function *CalledFunction = I->getCalledFunction(); |
1013 | if (CalledFunction) { |
1014 | ORE->emit(OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), |
1015 | "InlineAttempt" , I->getDebugLoc(), |
1016 | I->getParent()) |
1017 | << "previous inlining reattempted for " |
1018 | << (Hot ? "hotness: '" : "size: '" ) |
1019 | << ore::NV("Callee" , CalledFunction) << "' into '" |
1020 | << ore::NV("Caller" , &F) << "'" ); |
1021 | } |
1022 | } |
1023 | } |
1024 | |
1025 | void SampleProfileLoader::findExternalInlineCandidate( |
1026 | CallBase *CB, const FunctionSamples *Samples, |
1027 | DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) { |
1028 | |
1029 | // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external |
1030 | // function make sure it's imported |
1031 | if (CB && getExternalInlineAdvisorShouldInline(CB&: *CB)) { |
1032 | // Samples may not exist for replayed function, if so |
1033 | // just add the direct GUID and move on |
1034 | if (!Samples) { |
1035 | InlinedGUIDs.insert( |
1036 | V: Function::getGUID(GlobalName: CB->getCalledFunction()->getName())); |
1037 | return; |
1038 | } |
1039 | // Otherwise, drop the threshold to import everything that we can |
1040 | Threshold = 0; |
1041 | } |
1042 | |
1043 | // In some rare cases, call instruction could be changed after being pushed |
1044 | // into inline candidate queue, this is because earlier inlining may expose |
1045 | // constant propagation which can change indirect call to direct call. When |
1046 | // this happens, we may fail to find matching function samples for the |
1047 | // candidate later, even if a match was found when the candidate was enqueued. |
1048 | if (!Samples) |
1049 | return; |
1050 | |
1051 | // For AutoFDO profile, retrieve candidate profiles by walking over |
1052 | // the nested inlinee profiles. |
1053 | if (!FunctionSamples::ProfileIsCS) { |
1054 | // Set threshold to zero to honor pre-inliner decision. |
1055 | if (UsePreInlinerDecision) |
1056 | Threshold = 0; |
1057 | Samples->findInlinedFunctions(S&: InlinedGUIDs, SymbolMap, Threshold); |
1058 | return; |
1059 | } |
1060 | |
1061 | ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(FSamples: Samples); |
1062 | std::queue<ContextTrieNode *> CalleeList; |
1063 | CalleeList.push(x: Caller); |
1064 | while (!CalleeList.empty()) { |
1065 | ContextTrieNode *Node = CalleeList.front(); |
1066 | CalleeList.pop(); |
1067 | FunctionSamples *CalleeSample = Node->getFunctionSamples(); |
1068 | // For CSSPGO profile, retrieve candidate profile by walking over the |
1069 | // trie built for context profile. Note that also take call targets |
1070 | // even if callee doesn't have a corresponding context profile. |
1071 | if (!CalleeSample) |
1072 | continue; |
1073 | |
1074 | // If pre-inliner decision is used, honor that for importing as well. |
1075 | bool PreInline = |
1076 | UsePreInlinerDecision && |
1077 | CalleeSample->getContext().hasAttribute(A: ContextShouldBeInlined); |
1078 | if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold) |
1079 | continue; |
1080 | |
1081 | Function *Func = SymbolMap.lookup(Key: CalleeSample->getFunction()); |
1082 | // Add to the import list only when it's defined out of module. |
1083 | if (!Func || Func->isDeclaration()) |
1084 | InlinedGUIDs.insert(V: CalleeSample->getGUID()); |
1085 | |
1086 | // Import hot CallTargets, which may not be available in IR because full |
1087 | // profile annotation cannot be done until backend compilation in ThinLTO. |
1088 | for (const auto &BS : CalleeSample->getBodySamples()) |
1089 | for (const auto &TS : BS.second.getCallTargets()) |
1090 | if (TS.second > Threshold) { |
1091 | const Function *Callee = SymbolMap.lookup(Key: TS.first); |
1092 | if (!Callee || Callee->isDeclaration()) |
1093 | InlinedGUIDs.insert(V: TS.first.getHashCode()); |
1094 | } |
1095 | |
1096 | // Import hot child context profile associted with callees. Note that this |
1097 | // may have some overlap with the call target loop above, but doing this |
1098 | // based child context profile again effectively allow us to use the max of |
1099 | // entry count and call target count to determine importing. |
1100 | for (auto &Child : Node->getAllChildContext()) { |
1101 | ContextTrieNode *CalleeNode = &Child.second; |
1102 | CalleeList.push(x: CalleeNode); |
1103 | } |
1104 | } |
1105 | } |
1106 | |
1107 | /// Iteratively inline hot callsites of a function. |
1108 | /// |
1109 | /// Iteratively traverse all callsites of the function \p F, so as to |
1110 | /// find out callsites with corresponding inline instances. |
1111 | /// |
1112 | /// For such callsites, |
1113 | /// - If it is hot enough, inline the callsites and adds callsites of the callee |
1114 | /// into the caller. If the call is an indirect call, first promote |
1115 | /// it to direct call. Each indirect call is limited with a single target. |
1116 | /// |
1117 | /// - If a callsite is not inlined, merge the its profile to the outline |
1118 | /// version (if --sample-profile-merge-inlinee is true), or scale the |
1119 | /// counters of standalone function based on the profile of inlined |
1120 | /// instances (if --sample-profile-merge-inlinee is false). |
1121 | /// |
1122 | /// Later passes may consume the updated profiles. |
1123 | /// |
1124 | /// \param F function to perform iterative inlining. |
1125 | /// \param InlinedGUIDs a set to be updated to include all GUIDs that are |
1126 | /// inlined in the profiled binary. |
1127 | /// |
1128 | /// \returns True if there is any inline happened. |
1129 | bool SampleProfileLoader::inlineHotFunctions( |
1130 | Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { |
1131 | // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure |
1132 | // Profile symbol list is ignored when profile-sample-accurate is on. |
1133 | assert((!ProfAccForSymsInList || |
1134 | (!ProfileSampleAccurate && |
1135 | !F.hasFnAttribute("profile-sample-accurate" ))) && |
1136 | "ProfAccForSymsInList should be false when profile-sample-accurate " |
1137 | "is enabled" ); |
1138 | |
1139 | MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; |
1140 | bool Changed = false; |
1141 | bool LocalChanged = true; |
1142 | while (LocalChanged) { |
1143 | LocalChanged = false; |
1144 | SmallVector<CallBase *, 10> CIS; |
1145 | for (auto &BB : F) { |
1146 | bool Hot = false; |
1147 | SmallVector<CallBase *, 10> AllCandidates; |
1148 | SmallVector<CallBase *, 10> ColdCandidates; |
1149 | for (auto &I : BB) { |
1150 | const FunctionSamples *FS = nullptr; |
1151 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
1152 | if (!isa<IntrinsicInst>(Val: I)) { |
1153 | if ((FS = findCalleeFunctionSamples(Inst: *CB))) { |
1154 | assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && |
1155 | "GUIDToFuncNameMap has to be populated" ); |
1156 | AllCandidates.push_back(Elt: CB); |
1157 | if (FS->getHeadSamplesEstimate() > 0 || |
1158 | FunctionSamples::ProfileIsCS) |
1159 | LocalNotInlinedCallSites.insert(KV: {CB, FS}); |
1160 | if (callsiteIsHot(CallsiteFS: FS, PSI, ProfAccForSymsInList)) |
1161 | Hot = true; |
1162 | else if (shouldInlineColdCallee(CallInst&: *CB)) |
1163 | ColdCandidates.push_back(Elt: CB); |
1164 | } else if (getExternalInlineAdvisorShouldInline(CB&: *CB)) { |
1165 | AllCandidates.push_back(Elt: CB); |
1166 | } |
1167 | } |
1168 | } |
1169 | } |
1170 | if (Hot || ExternalInlineAdvisor) { |
1171 | CIS.insert(I: CIS.begin(), From: AllCandidates.begin(), To: AllCandidates.end()); |
1172 | emitOptimizationRemarksForInlineCandidates(Candidates: AllCandidates, F, Hot: true); |
1173 | } else { |
1174 | CIS.insert(I: CIS.begin(), From: ColdCandidates.begin(), To: ColdCandidates.end()); |
1175 | emitOptimizationRemarksForInlineCandidates(Candidates: ColdCandidates, F, Hot: false); |
1176 | } |
1177 | } |
1178 | for (CallBase *I : CIS) { |
1179 | Function *CalledFunction = I->getCalledFunction(); |
1180 | InlineCandidate Candidate = {.CallInstr: I, .CalleeSamples: LocalNotInlinedCallSites.lookup(Key: I), |
1181 | .CallsiteCount: 0 /* dummy count */, |
1182 | .CallsiteDistribution: 1.0 /* dummy distribution factor */}; |
1183 | // Do not inline recursive calls. |
1184 | if (CalledFunction == &F) |
1185 | continue; |
1186 | if (I->isIndirectCall()) { |
1187 | uint64_t Sum; |
1188 | for (const auto *FS : findIndirectCallFunctionSamples(Inst: *I, Sum)) { |
1189 | uint64_t SumOrigin = Sum; |
1190 | if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
1191 | findExternalInlineCandidate(CB: I, Samples: FS, InlinedGUIDs, |
1192 | Threshold: PSI->getOrCompHotCountThreshold()); |
1193 | continue; |
1194 | } |
1195 | if (!callsiteIsHot(CallsiteFS: FS, PSI, ProfAccForSymsInList)) |
1196 | continue; |
1197 | |
1198 | Candidate = {.CallInstr: I, .CalleeSamples: FS, .CallsiteCount: FS->getHeadSamplesEstimate(), .CallsiteDistribution: 1.0}; |
1199 | if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) { |
1200 | LocalNotInlinedCallSites.erase(Key: I); |
1201 | LocalChanged = true; |
1202 | } |
1203 | } |
1204 | } else if (CalledFunction && CalledFunction->getSubprogram() && |
1205 | !CalledFunction->isDeclaration()) { |
1206 | if (tryInlineCandidate(Candidate)) { |
1207 | LocalNotInlinedCallSites.erase(Key: I); |
1208 | LocalChanged = true; |
1209 | } |
1210 | } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
1211 | findExternalInlineCandidate(CB: I, Samples: findCalleeFunctionSamples(Inst: *I), |
1212 | InlinedGUIDs, |
1213 | Threshold: PSI->getOrCompHotCountThreshold()); |
1214 | } |
1215 | } |
1216 | Changed |= LocalChanged; |
1217 | } |
1218 | |
1219 | // For CS profile, profile for not inlined context will be merged when |
1220 | // base profile is being retrieved. |
1221 | if (!FunctionSamples::ProfileIsCS) |
1222 | promoteMergeNotInlinedContextSamples(NonInlinedCallSites: LocalNotInlinedCallSites, F); |
1223 | return Changed; |
1224 | } |
1225 | |
1226 | bool SampleProfileLoader::tryInlineCandidate( |
1227 | InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) { |
1228 | // Do not attempt to inline a candidate if |
1229 | // --disable-sample-loader-inlining is true. |
1230 | if (DisableSampleLoaderInlining) |
1231 | return false; |
1232 | |
1233 | CallBase &CB = *Candidate.CallInstr; |
1234 | Function *CalledFunction = CB.getCalledFunction(); |
1235 | assert(CalledFunction && "Expect a callee with definition" ); |
1236 | DebugLoc DLoc = CB.getDebugLoc(); |
1237 | BasicBlock *BB = CB.getParent(); |
1238 | |
1239 | InlineCost Cost = shouldInlineCandidate(Candidate); |
1240 | if (Cost.isNever()) { |
1241 | ORE->emit(OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), |
1242 | "InlineFail" , DLoc, BB) |
1243 | << "incompatible inlining" ); |
1244 | return false; |
1245 | } |
1246 | |
1247 | if (!Cost) |
1248 | return false; |
1249 | |
1250 | InlineFunctionInfo IFI(GetAC); |
1251 | IFI.UpdateProfile = false; |
1252 | InlineResult IR = InlineFunction(CB, IFI, |
1253 | /*MergeAttributes=*/true); |
1254 | if (!IR.isSuccess()) |
1255 | return false; |
1256 | |
1257 | // The call to InlineFunction erases I, so we can't pass it here. |
1258 | emitInlinedIntoBasedOnCost(ORE&: *ORE, DLoc, Block: BB, Callee: *CalledFunction, Caller: *BB->getParent(), |
1259 | IC: Cost, ForProfileContext: true, PassName: getAnnotatedRemarkPassName()); |
1260 | |
1261 | // Now populate the list of newly exposed call sites. |
1262 | if (InlinedCallSites) { |
1263 | InlinedCallSites->clear(); |
1264 | for (auto &I : IFI.InlinedCallSites) |
1265 | InlinedCallSites->push_back(Elt: I); |
1266 | } |
1267 | |
1268 | if (FunctionSamples::ProfileIsCS) |
1269 | ContextTracker->markContextSamplesInlined(InlinedSamples: Candidate.CalleeSamples); |
1270 | ++NumCSInlined; |
1271 | |
1272 | // Prorate inlined probes for a duplicated inlining callsite which probably |
1273 | // has a distribution less than 100%. Samples for an inlinee should be |
1274 | // distributed among the copies of the original callsite based on each |
1275 | // callsite's distribution factor for counts accuracy. Note that an inlined |
1276 | // probe may come with its own distribution factor if it has been duplicated |
1277 | // in the inlinee body. The two factor are multiplied to reflect the |
1278 | // aggregation of duplication. |
1279 | if (Candidate.CallsiteDistribution < 1) { |
1280 | for (auto &I : IFI.InlinedCallSites) { |
1281 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: *I)) |
1282 | setProbeDistributionFactor(Inst&: *I, Factor: Probe->Factor * |
1283 | Candidate.CallsiteDistribution); |
1284 | } |
1285 | NumDuplicatedInlinesite++; |
1286 | } |
1287 | |
1288 | return true; |
1289 | } |
1290 | |
1291 | bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, |
1292 | CallBase *CB) { |
1293 | assert(CB && "Expect non-null call instruction" ); |
1294 | |
1295 | if (isa<IntrinsicInst>(Val: CB)) |
1296 | return false; |
1297 | |
1298 | // Find the callee's profile. For indirect call, find hottest target profile. |
1299 | const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(Inst: *CB); |
1300 | // If ExternalInlineAdvisor wants to inline this site, do so even |
1301 | // if Samples are not present. |
1302 | if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(CB&: *CB)) |
1303 | return false; |
1304 | |
1305 | float Factor = 1.0; |
1306 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: *CB)) |
1307 | Factor = Probe->Factor; |
1308 | |
1309 | uint64_t CallsiteCount = |
1310 | CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0; |
1311 | *NewCandidate = {.CallInstr: CB, .CalleeSamples: CalleeSamples, .CallsiteCount: CallsiteCount, .CallsiteDistribution: Factor}; |
1312 | return true; |
1313 | } |
1314 | |
1315 | std::optional<InlineCost> |
1316 | SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) { |
1317 | std::unique_ptr<InlineAdvice> Advice = nullptr; |
1318 | if (ExternalInlineAdvisor) { |
1319 | Advice = ExternalInlineAdvisor->getAdvice(CB); |
1320 | if (Advice) { |
1321 | if (!Advice->isInliningRecommended()) { |
1322 | Advice->recordUnattemptedInlining(); |
1323 | return InlineCost::getNever(Reason: "not previously inlined" ); |
1324 | } |
1325 | Advice->recordInlining(); |
1326 | return InlineCost::getAlways(Reason: "previously inlined" ); |
1327 | } |
1328 | } |
1329 | |
1330 | return {}; |
1331 | } |
1332 | |
1333 | bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) { |
1334 | std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB); |
1335 | return Cost ? !!*Cost : false; |
1336 | } |
1337 | |
1338 | InlineCost |
1339 | SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { |
1340 | if (std::optional<InlineCost> ReplayCost = |
1341 | getExternalInlineAdvisorCost(CB&: *Candidate.CallInstr)) |
1342 | return *ReplayCost; |
1343 | // Adjust threshold based on call site hotness, only do this for callsite |
1344 | // prioritized inliner because otherwise cost-benefit check is done earlier. |
1345 | int SampleThreshold = SampleColdCallSiteThreshold; |
1346 | if (CallsitePrioritizedInline) { |
1347 | if (Candidate.CallsiteCount > PSI->getHotCountThreshold()) |
1348 | SampleThreshold = SampleHotCallSiteThreshold; |
1349 | else if (!ProfileSizeInline) |
1350 | return InlineCost::getNever(Reason: "cold callsite" ); |
1351 | } |
1352 | |
1353 | Function *Callee = Candidate.CallInstr->getCalledFunction(); |
1354 | assert(Callee && "Expect a definition for inline candidate of direct call" ); |
1355 | |
1356 | InlineParams Params = getInlineParams(); |
1357 | // We will ignore the threshold from inline cost, so always get full cost. |
1358 | Params.ComputeFullInlineCost = true; |
1359 | Params.AllowRecursiveCall = AllowRecursiveInline; |
1360 | // Checks if there is anything in the reachable portion of the callee at |
1361 | // this callsite that makes this inlining potentially illegal. Need to |
1362 | // set ComputeFullInlineCost, otherwise getInlineCost may return early |
1363 | // when cost exceeds threshold without checking all IRs in the callee. |
1364 | // The acutal cost does not matter because we only checks isNever() to |
1365 | // see if it is legal to inline the callsite. |
1366 | InlineCost Cost = getInlineCost(Call&: *Candidate.CallInstr, Callee, Params, |
1367 | CalleeTTI&: GetTTI(*Callee), GetAssumptionCache: GetAC, GetTLI); |
1368 | |
1369 | // Honor always inline and never inline from call analyzer |
1370 | if (Cost.isNever() || Cost.isAlways()) |
1371 | return Cost; |
1372 | |
1373 | // With CSSPGO, the preinliner in llvm-profgen can estimate global inline |
1374 | // decisions based on hotness as well as accurate function byte sizes for |
1375 | // given context using function/inlinee sizes from previous build. It |
1376 | // stores the decision in profile, and also adjust/merge context profile |
1377 | // aiming at better context-sensitive post-inline profile quality, assuming |
1378 | // all inline decision estimates are going to be honored by compiler. Here |
1379 | // we replay that inline decision under `sample-profile-use-preinliner`. |
1380 | // Note that we don't need to handle negative decision from preinliner as |
1381 | // context profile for not inlined calls are merged by preinliner already. |
1382 | if (UsePreInlinerDecision && Candidate.CalleeSamples) { |
1383 | // Once two node are merged due to promotion, we're losing some context |
1384 | // so the original context-sensitive preinliner decision should be ignored |
1385 | // for SyntheticContext. |
1386 | SampleContext &Context = Candidate.CalleeSamples->getContext(); |
1387 | if (!Context.hasState(S: SyntheticContext) && |
1388 | Context.hasAttribute(A: ContextShouldBeInlined)) |
1389 | return InlineCost::getAlways(Reason: "preinliner" ); |
1390 | } |
1391 | |
1392 | // For old FDO inliner, we inline the call site as long as cost is not |
1393 | // "Never". The cost-benefit check is done earlier. |
1394 | if (!CallsitePrioritizedInline) { |
1395 | return InlineCost::get(Cost: Cost.getCost(), INT_MAX); |
1396 | } |
1397 | |
1398 | // Otherwise only use the cost from call analyzer, but overwrite threshold with |
1399 | // Sample PGO threshold. |
1400 | return InlineCost::get(Cost: Cost.getCost(), Threshold: SampleThreshold); |
1401 | } |
1402 | |
1403 | bool SampleProfileLoader::inlineHotFunctionsWithPriority( |
1404 | Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { |
1405 | // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure |
1406 | // Profile symbol list is ignored when profile-sample-accurate is on. |
1407 | assert((!ProfAccForSymsInList || |
1408 | (!ProfileSampleAccurate && |
1409 | !F.hasFnAttribute("profile-sample-accurate" ))) && |
1410 | "ProfAccForSymsInList should be false when profile-sample-accurate " |
1411 | "is enabled" ); |
1412 | |
1413 | // Populating worklist with initial call sites from root inliner, along |
1414 | // with call site weights. |
1415 | CandidateQueue CQueue; |
1416 | InlineCandidate NewCandidate; |
1417 | for (auto &BB : F) { |
1418 | for (auto &I : BB) { |
1419 | auto *CB = dyn_cast<CallBase>(Val: &I); |
1420 | if (!CB) |
1421 | continue; |
1422 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
1423 | CQueue.push(x: NewCandidate); |
1424 | } |
1425 | } |
1426 | |
1427 | // Cap the size growth from profile guided inlining. This is needed even |
1428 | // though cost of each inline candidate already accounts for callee size, |
1429 | // because with top-down inlining, we can grow inliner size significantly |
1430 | // with large number of smaller inlinees each pass the cost check. |
1431 | assert(ProfileInlineLimitMax >= ProfileInlineLimitMin && |
1432 | "Max inline size limit should not be smaller than min inline size " |
1433 | "limit." ); |
1434 | unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit; |
1435 | SizeLimit = std::min(a: SizeLimit, b: (unsigned)ProfileInlineLimitMax); |
1436 | SizeLimit = std::max(a: SizeLimit, b: (unsigned)ProfileInlineLimitMin); |
1437 | if (ExternalInlineAdvisor) |
1438 | SizeLimit = std::numeric_limits<unsigned>::max(); |
1439 | |
1440 | MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; |
1441 | |
1442 | // Perform iterative BFS call site prioritized inlining |
1443 | bool Changed = false; |
1444 | while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { |
1445 | InlineCandidate Candidate = CQueue.top(); |
1446 | CQueue.pop(); |
1447 | CallBase *I = Candidate.CallInstr; |
1448 | Function *CalledFunction = I->getCalledFunction(); |
1449 | |
1450 | if (CalledFunction == &F) |
1451 | continue; |
1452 | if (I->isIndirectCall()) { |
1453 | uint64_t Sum = 0; |
1454 | auto CalleeSamples = findIndirectCallFunctionSamples(Inst: *I, Sum); |
1455 | uint64_t SumOrigin = Sum; |
1456 | Sum *= Candidate.CallsiteDistribution; |
1457 | unsigned ICPCount = 0; |
1458 | for (const auto *FS : CalleeSamples) { |
1459 | // TODO: Consider disable pre-lTO ICP for MonoLTO as well |
1460 | if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
1461 | findExternalInlineCandidate(CB: I, Samples: FS, InlinedGUIDs, |
1462 | Threshold: PSI->getOrCompHotCountThreshold()); |
1463 | continue; |
1464 | } |
1465 | uint64_t EntryCountDistributed = |
1466 | FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution; |
1467 | // In addition to regular inline cost check, we also need to make sure |
1468 | // ICP isn't introducing excessive speculative checks even if individual |
1469 | // target looks beneficial to promote and inline. That means we should |
1470 | // only do ICP when there's a small number dominant targets. |
1471 | if (ICPCount >= ProfileICPRelativeHotnessSkip && |
1472 | EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness) |
1473 | break; |
1474 | // TODO: Fix CallAnalyzer to handle all indirect calls. |
1475 | // For indirect call, we don't run CallAnalyzer to get InlineCost |
1476 | // before actual inlining. This is because we could see two different |
1477 | // types from the same definition, which makes CallAnalyzer choke as |
1478 | // it's expecting matching parameter type on both caller and callee |
1479 | // side. See example from PR18962 for the triggering cases (the bug was |
1480 | // fixed, but we generate different types). |
1481 | if (!PSI->isHotCount(C: EntryCountDistributed)) |
1482 | break; |
1483 | SmallVector<CallBase *, 8> InlinedCallSites; |
1484 | // Attach function profile for promoted indirect callee, and update |
1485 | // call site count for the promoted inline candidate too. |
1486 | Candidate = {.CallInstr: I, .CalleeSamples: FS, .CallsiteCount: EntryCountDistributed, |
1487 | .CallsiteDistribution: Candidate.CallsiteDistribution}; |
1488 | if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, |
1489 | InlinedCallSite: &InlinedCallSites)) { |
1490 | for (auto *CB : InlinedCallSites) { |
1491 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
1492 | CQueue.emplace(args&: NewCandidate); |
1493 | } |
1494 | ICPCount++; |
1495 | Changed = true; |
1496 | } else if (!ContextTracker) { |
1497 | LocalNotInlinedCallSites.insert(KV: {I, FS}); |
1498 | } |
1499 | } |
1500 | } else if (CalledFunction && CalledFunction->getSubprogram() && |
1501 | !CalledFunction->isDeclaration()) { |
1502 | SmallVector<CallBase *, 8> InlinedCallSites; |
1503 | if (tryInlineCandidate(Candidate, InlinedCallSites: &InlinedCallSites)) { |
1504 | for (auto *CB : InlinedCallSites) { |
1505 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
1506 | CQueue.emplace(args&: NewCandidate); |
1507 | } |
1508 | Changed = true; |
1509 | } else if (!ContextTracker) { |
1510 | LocalNotInlinedCallSites.insert(KV: {I, Candidate.CalleeSamples}); |
1511 | } |
1512 | } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
1513 | findExternalInlineCandidate(CB: I, Samples: findCalleeFunctionSamples(Inst: *I), |
1514 | InlinedGUIDs, |
1515 | Threshold: PSI->getOrCompHotCountThreshold()); |
1516 | } |
1517 | } |
1518 | |
1519 | if (!CQueue.empty()) { |
1520 | if (SizeLimit == (unsigned)ProfileInlineLimitMax) |
1521 | ++NumCSInlinedHitMaxLimit; |
1522 | else if (SizeLimit == (unsigned)ProfileInlineLimitMin) |
1523 | ++NumCSInlinedHitMinLimit; |
1524 | else |
1525 | ++NumCSInlinedHitGrowthLimit; |
1526 | } |
1527 | |
1528 | // For CS profile, profile for not inlined context will be merged when |
1529 | // base profile is being retrieved. |
1530 | if (!FunctionSamples::ProfileIsCS) |
1531 | promoteMergeNotInlinedContextSamples(NonInlinedCallSites: LocalNotInlinedCallSites, F); |
1532 | return Changed; |
1533 | } |
1534 | |
1535 | void SampleProfileLoader::promoteMergeNotInlinedContextSamples( |
1536 | MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites, |
1537 | const Function &F) { |
1538 | // Accumulate not inlined callsite information into notInlinedSamples |
1539 | for (const auto &Pair : NonInlinedCallSites) { |
1540 | CallBase *I = Pair.first; |
1541 | Function *Callee = I->getCalledFunction(); |
1542 | if (!Callee || Callee->isDeclaration()) |
1543 | continue; |
1544 | |
1545 | ORE->emit( |
1546 | OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline" , |
1547 | I->getDebugLoc(), I->getParent()) |
1548 | << "previous inlining not repeated: '" << ore::NV("Callee" , Callee) |
1549 | << "' into '" << ore::NV("Caller" , &F) << "'" ); |
1550 | |
1551 | ++NumCSNotInlined; |
1552 | const FunctionSamples *FS = Pair.second; |
1553 | if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) { |
1554 | continue; |
1555 | } |
1556 | |
1557 | // Do not merge a context that is already duplicated into the base profile. |
1558 | if (FS->getContext().hasAttribute(A: sampleprof::ContextDuplicatedIntoBase)) |
1559 | continue; |
1560 | |
1561 | if (ProfileMergeInlinee) { |
1562 | // A function call can be replicated by optimizations like callsite |
1563 | // splitting or jump threading and the replicates end up sharing the |
1564 | // sample nested callee profile instead of slicing the original |
1565 | // inlinee's profile. We want to do merge exactly once by filtering out |
1566 | // callee profiles with a non-zero head sample count. |
1567 | if (FS->getHeadSamples() == 0) { |
1568 | // Use entry samples as head samples during the merge, as inlinees |
1569 | // don't have head samples. |
1570 | const_cast<FunctionSamples *>(FS)->addHeadSamples( |
1571 | Num: FS->getHeadSamplesEstimate()); |
1572 | |
1573 | // Note that we have to do the merge right after processing function. |
1574 | // This allows OutlineFS's profile to be used for annotation during |
1575 | // top-down processing of functions' annotation. |
1576 | FunctionSamples *OutlineFS = Reader->getSamplesFor(F: *Callee); |
1577 | // If outlined function does not exist in the profile, add it to a |
1578 | // separate map so that it does not rehash the original profile. |
1579 | if (!OutlineFS) |
1580 | OutlineFS = &OutlineFunctionSamples[ |
1581 | FunctionId(FunctionSamples::getCanonicalFnName(FnName: Callee->getName()))]; |
1582 | OutlineFS->merge(Other: *FS, Weight: 1); |
1583 | // Set outlined profile to be synthetic to not bias the inliner. |
1584 | OutlineFS->SetContextSynthetic(); |
1585 | } |
1586 | } else { |
1587 | auto pair = |
1588 | notInlinedCallInfo.try_emplace(Key: Callee, Args: NotInlinedProfileInfo{.entryCount: 0}); |
1589 | pair.first->second.entryCount += FS->getHeadSamplesEstimate(); |
1590 | } |
1591 | } |
1592 | } |
1593 | |
1594 | /// Returns the sorted CallTargetMap \p M by count in descending order. |
1595 | static SmallVector<InstrProfValueData, 2> |
1596 | GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) { |
1597 | SmallVector<InstrProfValueData, 2> R; |
1598 | for (const auto &I : SampleRecord::SortCallTargets(Targets: M)) { |
1599 | R.emplace_back( |
1600 | Args: InstrProfValueData{.Value: I.first.getHashCode(), .Count: I.second}); |
1601 | } |
1602 | return R; |
1603 | } |
1604 | |
1605 | // Generate MD_prof metadata for every branch instruction using the |
1606 | // edge weights computed during propagation. |
1607 | void SampleProfileLoader::generateMDProfMetadata(Function &F) { |
1608 | // Generate MD_prof metadata for every branch instruction using the |
1609 | // edge weights computed during propagation. |
1610 | LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n" ); |
1611 | LLVMContext &Ctx = F.getContext(); |
1612 | MDBuilder MDB(Ctx); |
1613 | for (auto &BI : F) { |
1614 | BasicBlock *BB = &BI; |
1615 | |
1616 | if (BlockWeights[BB]) { |
1617 | for (auto &I : *BB) { |
1618 | if (!isa<CallInst>(Val: I) && !isa<InvokeInst>(Val: I)) |
1619 | continue; |
1620 | if (!cast<CallBase>(Val&: I).getCalledFunction()) { |
1621 | const DebugLoc &DLoc = I.getDebugLoc(); |
1622 | if (!DLoc) |
1623 | continue; |
1624 | const DILocation *DIL = DLoc; |
1625 | const FunctionSamples *FS = findFunctionSamples(Inst: I); |
1626 | if (!FS) |
1627 | continue; |
1628 | auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); |
1629 | ErrorOr<SampleRecord::CallTargetMap> T = |
1630 | FS->findCallTargetMapAt(CallSite); |
1631 | if (!T || T.get().empty()) |
1632 | continue; |
1633 | if (FunctionSamples::ProfileIsProbeBased) { |
1634 | // Prorate the callsite counts based on the pre-ICP distribution |
1635 | // factor to reflect what is already done to the callsite before |
1636 | // ICP, such as calliste cloning. |
1637 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: I)) { |
1638 | if (Probe->Factor < 1) |
1639 | T = SampleRecord::adjustCallTargets(Targets: T.get(), DistributionFactor: Probe->Factor); |
1640 | } |
1641 | } |
1642 | SmallVector<InstrProfValueData, 2> SortedCallTargets = |
1643 | GetSortedValueDataFromCallTargets(M: T.get()); |
1644 | uint64_t Sum = 0; |
1645 | for (const auto &C : T.get()) |
1646 | Sum += C.second; |
1647 | // With CSSPGO all indirect call targets are counted torwards the |
1648 | // original indirect call site in the profile, including both |
1649 | // inlined and non-inlined targets. |
1650 | if (!FunctionSamples::ProfileIsCS) { |
1651 | if (const FunctionSamplesMap *M = |
1652 | FS->findFunctionSamplesMapAt(Loc: CallSite)) { |
1653 | for (const auto &NameFS : *M) |
1654 | Sum += NameFS.second.getHeadSamplesEstimate(); |
1655 | } |
1656 | } |
1657 | if (Sum) |
1658 | updateIDTMetaData(Inst&: I, CallTargets: SortedCallTargets, Sum); |
1659 | else if (OverwriteExistingWeights) |
1660 | I.setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
1661 | } else if (!isa<IntrinsicInst>(Val: &I)) { |
1662 | setBranchWeights(I, Weights: {static_cast<uint32_t>(BlockWeights[BB])}); |
1663 | } |
1664 | } |
1665 | } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) { |
1666 | // Set profile metadata (possibly annotated by LTO prelink) to zero or |
1667 | // clear it for cold code. |
1668 | for (auto &I : *BB) { |
1669 | if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) { |
1670 | if (cast<CallBase>(Val&: I).isIndirectCall()) { |
1671 | I.setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
1672 | } else { |
1673 | setBranchWeights(I, Weights: {uint32_t(0)}); |
1674 | } |
1675 | } |
1676 | } |
1677 | } |
1678 | |
1679 | Instruction *TI = BB->getTerminator(); |
1680 | if (TI->getNumSuccessors() == 1) |
1681 | continue; |
1682 | if (!isa<BranchInst>(Val: TI) && !isa<SwitchInst>(Val: TI) && |
1683 | !isa<IndirectBrInst>(Val: TI)) |
1684 | continue; |
1685 | |
1686 | DebugLoc BranchLoc = TI->getDebugLoc(); |
1687 | LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line " |
1688 | << ((BranchLoc) ? Twine(BranchLoc.getLine()) |
1689 | : Twine("<UNKNOWN LOCATION>" )) |
1690 | << ".\n" ); |
1691 | SmallVector<uint32_t, 4> Weights; |
1692 | uint32_t MaxWeight = 0; |
1693 | Instruction *MaxDestInst; |
1694 | // Since profi treats multiple edges (multiway branches) as a single edge, |
1695 | // we need to distribute the computed weight among the branches. We do |
1696 | // this by evenly splitting the edge weight among destinations. |
1697 | DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity; |
1698 | std::vector<uint64_t> EdgeIndex; |
1699 | if (SampleProfileUseProfi) { |
1700 | EdgeIndex.resize(new_size: TI->getNumSuccessors()); |
1701 | for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { |
1702 | const BasicBlock *Succ = TI->getSuccessor(Idx: I); |
1703 | EdgeIndex[I] = EdgeMultiplicity[Succ]; |
1704 | EdgeMultiplicity[Succ]++; |
1705 | } |
1706 | } |
1707 | for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { |
1708 | BasicBlock *Succ = TI->getSuccessor(Idx: I); |
1709 | Edge E = std::make_pair(x&: BB, y&: Succ); |
1710 | uint64_t Weight = EdgeWeights[E]; |
1711 | LLVM_DEBUG(dbgs() << "\t" ; printEdgeWeight(dbgs(), E)); |
1712 | // Use uint32_t saturated arithmetic to adjust the incoming weights, |
1713 | // if needed. Sample counts in profiles are 64-bit unsigned values, |
1714 | // but internally branch weights are expressed as 32-bit values. |
1715 | if (Weight > std::numeric_limits<uint32_t>::max()) { |
1716 | LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)" ); |
1717 | Weight = std::numeric_limits<uint32_t>::max(); |
1718 | } |
1719 | if (!SampleProfileUseProfi) { |
1720 | // Weight is added by one to avoid propagation errors introduced by |
1721 | // 0 weights. |
1722 | Weights.push_back(Elt: static_cast<uint32_t>(Weight + 1)); |
1723 | } else { |
1724 | // Profi creates proper weights that do not require "+1" adjustments but |
1725 | // we evenly split the weight among branches with the same destination. |
1726 | uint64_t W = Weight / EdgeMultiplicity[Succ]; |
1727 | // Rounding up, if needed, so that first branches are hotter. |
1728 | if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ]) |
1729 | W++; |
1730 | Weights.push_back(Elt: static_cast<uint32_t>(W)); |
1731 | } |
1732 | if (Weight != 0) { |
1733 | if (Weight > MaxWeight) { |
1734 | MaxWeight = Weight; |
1735 | MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime(); |
1736 | } |
1737 | } |
1738 | } |
1739 | |
1740 | misexpect::checkExpectAnnotations(I&: *TI, ExistingWeights: Weights, /*IsFrontend=*/false); |
1741 | |
1742 | uint64_t TempWeight; |
1743 | // Only set weights if there is at least one non-zero weight. |
1744 | // In any other case, let the analyzer set weights. |
1745 | // Do not set weights if the weights are present unless under |
1746 | // OverwriteExistingWeights. In ThinLTO, the profile annotation is done |
1747 | // twice. If the first annotation already set the weights, the second pass |
1748 | // does not need to set it. With OverwriteExistingWeights, Blocks with zero |
1749 | // weight should have their existing metadata (possibly annotated by LTO |
1750 | // prelink) cleared. |
1751 | if (MaxWeight > 0 && |
1752 | (!TI->extractProfTotalWeight(TotalVal&: TempWeight) || OverwriteExistingWeights)) { |
1753 | LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n" ); |
1754 | setBranchWeights(I&: *TI, Weights); |
1755 | ORE->emit(RemarkBuilder: [&]() { |
1756 | return OptimizationRemark(DEBUG_TYPE, "PopularDest" , MaxDestInst) |
1757 | << "most popular destination for conditional branches at " |
1758 | << ore::NV("CondBranchesLoc" , BranchLoc); |
1759 | }); |
1760 | } else { |
1761 | if (OverwriteExistingWeights) { |
1762 | TI->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
1763 | LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n" ); |
1764 | } else { |
1765 | LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n" ); |
1766 | } |
1767 | } |
1768 | } |
1769 | } |
1770 | |
1771 | /// Once all the branch weights are computed, we emit the MD_prof |
1772 | /// metadata on BB using the computed values for each of its branches. |
1773 | /// |
1774 | /// \param F The function to query. |
1775 | /// |
1776 | /// \returns true if \p F was modified. Returns false, otherwise. |
1777 | bool SampleProfileLoader::emitAnnotations(Function &F) { |
1778 | bool Changed = false; |
1779 | |
1780 | if (FunctionSamples::ProfileIsProbeBased) { |
1781 | LLVM_DEBUG({ |
1782 | if (!ProbeManager->getDesc(F)) |
1783 | dbgs() << "Probe descriptor missing for Function " << F.getName() |
1784 | << "\n" ; |
1785 | }); |
1786 | |
1787 | if (ProbeManager->profileIsValid(F, Samples: *Samples)) { |
1788 | ++NumMatchedProfile; |
1789 | } else { |
1790 | ++NumMismatchedProfile; |
1791 | LLVM_DEBUG( |
1792 | dbgs() << "Profile is invalid due to CFG mismatch for Function " |
1793 | << F.getName() << "\n" ); |
1794 | if (!SalvageStaleProfile) |
1795 | return false; |
1796 | } |
1797 | } else { |
1798 | if (getFunctionLoc(F) == 0) |
1799 | return false; |
1800 | |
1801 | LLVM_DEBUG(dbgs() << "Line number for the first instruction in " |
1802 | << F.getName() << ": " << getFunctionLoc(F) << "\n" ); |
1803 | } |
1804 | |
1805 | DenseSet<GlobalValue::GUID> InlinedGUIDs; |
1806 | if (CallsitePrioritizedInline) |
1807 | Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); |
1808 | else |
1809 | Changed |= inlineHotFunctions(F, InlinedGUIDs); |
1810 | |
1811 | Changed |= computeAndPropagateWeights(F, InlinedGUIDs); |
1812 | |
1813 | if (Changed) |
1814 | generateMDProfMetadata(F); |
1815 | |
1816 | emitCoverageRemarks(F); |
1817 | return Changed; |
1818 | } |
1819 | |
1820 | std::unique_ptr<ProfiledCallGraph> |
1821 | SampleProfileLoader::buildProfiledCallGraph(Module &M) { |
1822 | std::unique_ptr<ProfiledCallGraph> ProfiledCG; |
1823 | if (FunctionSamples::ProfileIsCS) |
1824 | ProfiledCG = std::make_unique<ProfiledCallGraph>(args&: *ContextTracker); |
1825 | else |
1826 | ProfiledCG = std::make_unique<ProfiledCallGraph>(args&: Reader->getProfiles()); |
1827 | |
1828 | // Add all functions into the profiled call graph even if they are not in |
1829 | // the profile. This makes sure functions missing from the profile still |
1830 | // gets a chance to be processed. |
1831 | for (Function &F : M) { |
1832 | if (skipProfileForFunction(F)) |
1833 | continue; |
1834 | ProfiledCG->addProfiledFunction( |
1835 | Name: getRepInFormat(Name: FunctionSamples::getCanonicalFnName(F))); |
1836 | } |
1837 | |
1838 | return ProfiledCG; |
1839 | } |
1840 | |
1841 | std::vector<Function *> |
1842 | SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) { |
1843 | std::vector<Function *> FunctionOrderList; |
1844 | FunctionOrderList.reserve(n: M.size()); |
1845 | |
1846 | if (!ProfileTopDownLoad && UseProfiledCallGraph) |
1847 | errs() << "WARNING: -use-profiled-call-graph ignored, should be used " |
1848 | "together with -sample-profile-top-down-load.\n" ; |
1849 | |
1850 | if (!ProfileTopDownLoad) { |
1851 | if (ProfileMergeInlinee) { |
1852 | // Disable ProfileMergeInlinee if profile is not loaded in top down order, |
1853 | // because the profile for a function may be used for the profile |
1854 | // annotation of its outline copy before the profile merging of its |
1855 | // non-inlined inline instances, and that is not the way how |
1856 | // ProfileMergeInlinee is supposed to work. |
1857 | ProfileMergeInlinee = false; |
1858 | } |
1859 | |
1860 | for (Function &F : M) |
1861 | if (!skipProfileForFunction(F)) |
1862 | FunctionOrderList.push_back(x: &F); |
1863 | return FunctionOrderList; |
1864 | } |
1865 | |
1866 | if (UseProfiledCallGraph || (FunctionSamples::ProfileIsCS && |
1867 | !UseProfiledCallGraph.getNumOccurrences())) { |
1868 | // Use profiled call edges to augment the top-down order. There are cases |
1869 | // that the top-down order computed based on the static call graph doesn't |
1870 | // reflect real execution order. For example |
1871 | // |
1872 | // 1. Incomplete static call graph due to unknown indirect call targets. |
1873 | // Adjusting the order by considering indirect call edges from the |
1874 | // profile can enable the inlining of indirect call targets by allowing |
1875 | // the caller processed before them. |
1876 | // 2. Mutual call edges in an SCC. The static processing order computed for |
1877 | // an SCC may not reflect the call contexts in the context-sensitive |
1878 | // profile, thus may cause potential inlining to be overlooked. The |
1879 | // function order in one SCC is being adjusted to a top-down order based |
1880 | // on the profile to favor more inlining. This is only a problem with CS |
1881 | // profile. |
1882 | // 3. Transitive indirect call edges due to inlining. When a callee function |
1883 | // (say B) is inlined into a caller function (say A) in LTO prelink, |
1884 | // every call edge originated from the callee B will be transferred to |
1885 | // the caller A. If any transferred edge (say A->C) is indirect, the |
1886 | // original profiled indirect edge B->C, even if considered, would not |
1887 | // enforce a top-down order from the caller A to the potential indirect |
1888 | // call target C in LTO postlink since the inlined callee B is gone from |
1889 | // the static call graph. |
1890 | // 4. #3 can happen even for direct call targets, due to functions defined |
1891 | // in header files. A header function (say A), when included into source |
1892 | // files, is defined multiple times but only one definition survives due |
1893 | // to ODR. Therefore, the LTO prelink inlining done on those dropped |
1894 | // definitions can be useless based on a local file scope. More |
1895 | // importantly, the inlinee (say B), once fully inlined to a |
1896 | // to-be-dropped A, will have no profile to consume when its outlined |
1897 | // version is compiled. This can lead to a profile-less prelink |
1898 | // compilation for the outlined version of B which may be called from |
1899 | // external modules. while this isn't easy to fix, we rely on the |
1900 | // postlink AutoFDO pipeline to optimize B. Since the survived copy of |
1901 | // the A can be inlined in its local scope in prelink, it may not exist |
1902 | // in the merged IR in postlink, and we'll need the profiled call edges |
1903 | // to enforce a top-down order for the rest of the functions. |
1904 | // |
1905 | // Considering those cases, a profiled call graph completely independent of |
1906 | // the static call graph is constructed based on profile data, where |
1907 | // function objects are not even needed to handle case #3 and case 4. |
1908 | // |
1909 | // Note that static callgraph edges are completely ignored since they |
1910 | // can be conflicting with profiled edges for cyclic SCCs and may result in |
1911 | // an SCC order incompatible with profile-defined one. Using strictly |
1912 | // profile order ensures a maximum inlining experience. On the other hand, |
1913 | // static call edges are not so important when they don't correspond to a |
1914 | // context in the profile. |
1915 | |
1916 | std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M); |
1917 | scc_iterator<ProfiledCallGraph *> CGI = scc_begin(G: ProfiledCG.get()); |
1918 | while (!CGI.isAtEnd()) { |
1919 | auto Range = *CGI; |
1920 | if (SortProfiledSCC) { |
1921 | // Sort nodes in one SCC based on callsite hotness. |
1922 | scc_member_iterator<ProfiledCallGraph *> SI(*CGI); |
1923 | Range = *SI; |
1924 | } |
1925 | for (auto *Node : Range) { |
1926 | Function *F = SymbolMap.lookup(Key: Node->Name); |
1927 | if (F && !skipProfileForFunction(F: *F)) |
1928 | FunctionOrderList.push_back(x: F); |
1929 | } |
1930 | ++CGI; |
1931 | } |
1932 | } else { |
1933 | CG.buildRefSCCs(); |
1934 | for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs()) { |
1935 | for (LazyCallGraph::SCC &C : RC) { |
1936 | for (LazyCallGraph::Node &N : C) { |
1937 | Function &F = N.getFunction(); |
1938 | if (!skipProfileForFunction(F)) |
1939 | FunctionOrderList.push_back(x: &F); |
1940 | } |
1941 | } |
1942 | } |
1943 | } |
1944 | |
1945 | std::reverse(first: FunctionOrderList.begin(), last: FunctionOrderList.end()); |
1946 | |
1947 | LLVM_DEBUG({ |
1948 | dbgs() << "Function processing order:\n" ; |
1949 | for (auto F : FunctionOrderList) { |
1950 | dbgs() << F->getName() << "\n" ; |
1951 | } |
1952 | }); |
1953 | |
1954 | return FunctionOrderList; |
1955 | } |
1956 | |
1957 | bool SampleProfileLoader::doInitialization(Module &M, |
1958 | FunctionAnalysisManager *FAM) { |
1959 | auto &Ctx = M.getContext(); |
1960 | |
1961 | auto ReaderOrErr = SampleProfileReader::create( |
1962 | Filename, C&: Ctx, FS&: *FS, P: FSDiscriminatorPass::Base, RemapFilename: RemappingFilename); |
1963 | if (std::error_code EC = ReaderOrErr.getError()) { |
1964 | std::string Msg = "Could not open profile: " + EC.message(); |
1965 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(Filename, Msg)); |
1966 | return false; |
1967 | } |
1968 | Reader = std::move(ReaderOrErr.get()); |
1969 | Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink); |
1970 | // set module before reading the profile so reader may be able to only |
1971 | // read the function profiles which are used by the current module. |
1972 | Reader->setModule(&M); |
1973 | if (std::error_code EC = Reader->read()) { |
1974 | std::string Msg = "profile reading failed: " + EC.message(); |
1975 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(Filename, Msg)); |
1976 | return false; |
1977 | } |
1978 | |
1979 | PSL = Reader->getProfileSymbolList(); |
1980 | |
1981 | // While profile-sample-accurate is on, ignore symbol list. |
1982 | ProfAccForSymsInList = |
1983 | ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate; |
1984 | if (ProfAccForSymsInList) { |
1985 | NamesInProfile.clear(); |
1986 | GUIDsInProfile.clear(); |
1987 | if (auto NameTable = Reader->getNameTable()) { |
1988 | if (FunctionSamples::UseMD5) { |
1989 | for (auto Name : *NameTable) |
1990 | GUIDsInProfile.insert(V: Name.getHashCode()); |
1991 | } else { |
1992 | for (auto Name : *NameTable) |
1993 | NamesInProfile.insert(key: Name.stringRef()); |
1994 | } |
1995 | } |
1996 | CoverageTracker.setProfAccForSymsInList(true); |
1997 | } |
1998 | |
1999 | if (FAM && !ProfileInlineReplayFile.empty()) { |
2000 | ExternalInlineAdvisor = getReplayInlineAdvisor( |
2001 | M, FAM&: *FAM, Context&: Ctx, /*OriginalAdvisor=*/nullptr, |
2002 | ReplaySettings: ReplayInlinerSettings{.ReplayFile: ProfileInlineReplayFile, |
2003 | .ReplayScope: ProfileInlineReplayScope, |
2004 | .ReplayFallback: ProfileInlineReplayFallback, |
2005 | .ReplayFormat: {.OutputFormat: ProfileInlineReplayFormat}}, |
2006 | /*EmitRemarks=*/false, IC: InlineContext{.LTOPhase: LTOPhase, .Pass: InlinePass::ReplaySampleProfileInliner}); |
2007 | } |
2008 | |
2009 | // Apply tweaks if context-sensitive or probe-based profile is available. |
2010 | if (Reader->profileIsCS() || Reader->profileIsPreInlined() || |
2011 | Reader->profileIsProbeBased()) { |
2012 | if (!UseIterativeBFIInference.getNumOccurrences()) |
2013 | UseIterativeBFIInference = true; |
2014 | if (!SampleProfileUseProfi.getNumOccurrences()) |
2015 | SampleProfileUseProfi = true; |
2016 | if (!EnableExtTspBlockPlacement.getNumOccurrences()) |
2017 | EnableExtTspBlockPlacement = true; |
2018 | // Enable priority-base inliner and size inline by default for CSSPGO. |
2019 | if (!ProfileSizeInline.getNumOccurrences()) |
2020 | ProfileSizeInline = true; |
2021 | if (!CallsitePrioritizedInline.getNumOccurrences()) |
2022 | CallsitePrioritizedInline = true; |
2023 | // For CSSPGO, we also allow recursive inline to best use context profile. |
2024 | if (!AllowRecursiveInline.getNumOccurrences()) |
2025 | AllowRecursiveInline = true; |
2026 | |
2027 | if (Reader->profileIsPreInlined()) { |
2028 | if (!UsePreInlinerDecision.getNumOccurrences()) |
2029 | UsePreInlinerDecision = true; |
2030 | } |
2031 | |
2032 | // Enable stale profile matching by default for probe-based profile. |
2033 | // Currently the matching relies on if the checksum mismatch is detected, |
2034 | // which is currently only available for pseudo-probe mode. Removing the |
2035 | // checksum check could cause regressions for some cases, so further tuning |
2036 | // might be needed if we want to enable it for all cases. |
2037 | if (Reader->profileIsProbeBased() && |
2038 | !SalvageStaleProfile.getNumOccurrences()) { |
2039 | SalvageStaleProfile = true; |
2040 | } |
2041 | |
2042 | if (!Reader->profileIsCS()) { |
2043 | // Non-CS profile should be fine without a function size budget for the |
2044 | // inliner since the contexts in the profile are either all from inlining |
2045 | // in the prevoius build or pre-computed by the preinliner with a size |
2046 | // cap, thus they are bounded. |
2047 | if (!ProfileInlineLimitMin.getNumOccurrences()) |
2048 | ProfileInlineLimitMin = std::numeric_limits<unsigned>::max(); |
2049 | if (!ProfileInlineLimitMax.getNumOccurrences()) |
2050 | ProfileInlineLimitMax = std::numeric_limits<unsigned>::max(); |
2051 | } |
2052 | } |
2053 | |
2054 | if (Reader->profileIsCS()) { |
2055 | // Tracker for profiles under different context |
2056 | ContextTracker = std::make_unique<SampleContextTracker>( |
2057 | args&: Reader->getProfiles(), args: &GUIDToFuncNameMap); |
2058 | } |
2059 | |
2060 | // Load pseudo probe descriptors for probe-based function samples. |
2061 | if (Reader->profileIsProbeBased()) { |
2062 | ProbeManager = std::make_unique<PseudoProbeManager>(args&: M); |
2063 | if (!ProbeManager->moduleIsProbed(M)) { |
2064 | const char *Msg = |
2065 | "Pseudo-probe-based profile requires SampleProfileProbePass" ; |
2066 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg, |
2067 | DS_Warning)); |
2068 | return false; |
2069 | } |
2070 | } |
2071 | |
2072 | if (ReportProfileStaleness || PersistProfileStaleness || |
2073 | SalvageStaleProfile) { |
2074 | MatchingManager = std::make_unique<SampleProfileMatcher>( |
2075 | args&: M, args&: *Reader, args: ProbeManager.get(), args: LTOPhase); |
2076 | } |
2077 | |
2078 | return true; |
2079 | } |
2080 | |
2081 | // Note that this is a module-level check. Even if one module is errored out, |
2082 | // the entire build will be errored out. However, the user could make big |
2083 | // changes to functions in single module but those changes might not be |
2084 | // performance significant to the whole binary. Therefore, to avoid those false |
2085 | // positives, we select a reasonable big set of hot functions that are supposed |
2086 | // to be globally performance significant, only compute and check the mismatch |
2087 | // within those functions. The function selection is based on two criteria: |
2088 | // 1) The function is hot enough, which is tuned by a hotness-based |
2089 | // flag(HotFuncCutoffForStalenessError). 2) The num of function is large enough |
2090 | // which is tuned by the MinfuncsForStalenessError flag. |
2091 | bool SampleProfileLoader::rejectHighStalenessProfile( |
2092 | Module &M, ProfileSummaryInfo *PSI, const SampleProfileMap &Profiles) { |
2093 | assert(FunctionSamples::ProfileIsProbeBased && |
2094 | "Only support for probe-based profile" ); |
2095 | uint64_t TotalHotFunc = 0; |
2096 | uint64_t NumMismatchedFunc = 0; |
2097 | for (const auto &I : Profiles) { |
2098 | const auto &FS = I.second; |
2099 | const auto *FuncDesc = ProbeManager->getDesc(GUID: FS.getGUID()); |
2100 | if (!FuncDesc) |
2101 | continue; |
2102 | |
2103 | // Use a hotness-based threshold to control the function selection. |
2104 | if (!PSI->isHotCountNthPercentile(PercentileCutoff: HotFuncCutoffForStalenessError, |
2105 | C: FS.getTotalSamples())) |
2106 | continue; |
2107 | |
2108 | TotalHotFunc++; |
2109 | if (ProbeManager->profileIsHashMismatched(FuncDesc: *FuncDesc, Samples: FS)) |
2110 | NumMismatchedFunc++; |
2111 | } |
2112 | // Make sure that the num of selected function is not too small to distinguish |
2113 | // from the user's benign changes. |
2114 | if (TotalHotFunc < MinfuncsForStalenessError) |
2115 | return false; |
2116 | |
2117 | // Finally check the mismatch percentage against the threshold. |
2118 | if (NumMismatchedFunc * 100 >= |
2119 | TotalHotFunc * PrecentMismatchForStalenessError) { |
2120 | auto &Ctx = M.getContext(); |
2121 | const char *Msg = |
2122 | "The input profile significantly mismatches current source code. " |
2123 | "Please recollect profile to avoid performance regression." ; |
2124 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg)); |
2125 | return true; |
2126 | } |
2127 | return false; |
2128 | } |
2129 | |
2130 | bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, |
2131 | ProfileSummaryInfo *_PSI, |
2132 | LazyCallGraph &CG) { |
2133 | GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); |
2134 | |
2135 | PSI = _PSI; |
2136 | if (M.getProfileSummary(/* IsCS */ false) == nullptr) { |
2137 | M.setProfileSummary(M: Reader->getSummary().getMD(Context&: M.getContext()), |
2138 | Kind: ProfileSummary::PSK_Sample); |
2139 | PSI->refresh(); |
2140 | } |
2141 | |
2142 | if (FunctionSamples::ProfileIsProbeBased && |
2143 | rejectHighStalenessProfile(M, PSI, Profiles: Reader->getProfiles())) |
2144 | return false; |
2145 | |
2146 | // Compute the total number of samples collected in this profile. |
2147 | for (const auto &I : Reader->getProfiles()) |
2148 | TotalCollectedSamples += I.second.getTotalSamples(); |
2149 | |
2150 | auto Remapper = Reader->getRemapper(); |
2151 | // Populate the symbol map. |
2152 | for (const auto &N_F : M.getValueSymbolTable()) { |
2153 | StringRef OrigName = N_F.getKey(); |
2154 | Function *F = dyn_cast<Function>(Val: N_F.getValue()); |
2155 | if (F == nullptr || OrigName.empty()) |
2156 | continue; |
2157 | SymbolMap[FunctionId(OrigName)] = F; |
2158 | StringRef NewName = FunctionSamples::getCanonicalFnName(F: *F); |
2159 | if (OrigName != NewName && !NewName.empty()) { |
2160 | auto r = SymbolMap.emplace(Args: FunctionId(NewName), Args&: F); |
2161 | // Failiing to insert means there is already an entry in SymbolMap, |
2162 | // thus there are multiple functions that are mapped to the same |
2163 | // stripped name. In this case of name conflicting, set the value |
2164 | // to nullptr to avoid confusion. |
2165 | if (!r.second) |
2166 | r.first->second = nullptr; |
2167 | OrigName = NewName; |
2168 | } |
2169 | // Insert the remapped names into SymbolMap. |
2170 | if (Remapper) { |
2171 | if (auto MapName = Remapper->lookUpNameInProfile(FunctionName: OrigName)) { |
2172 | if (*MapName != OrigName && !MapName->empty()) |
2173 | SymbolMap.emplace(Args: FunctionId(*MapName), Args&: F); |
2174 | } |
2175 | } |
2176 | } |
2177 | assert(SymbolMap.count(FunctionId()) == 0 && |
2178 | "No empty StringRef should be added in SymbolMap" ); |
2179 | |
2180 | if (ReportProfileStaleness || PersistProfileStaleness || |
2181 | SalvageStaleProfile) { |
2182 | MatchingManager->runOnModule(); |
2183 | MatchingManager->clearMatchingData(); |
2184 | } |
2185 | |
2186 | bool retval = false; |
2187 | for (auto *F : buildFunctionOrder(M, CG)) { |
2188 | assert(!F->isDeclaration()); |
2189 | clearFunctionData(); |
2190 | retval |= runOnFunction(F&: *F, AM); |
2191 | } |
2192 | |
2193 | // Account for cold calls not inlined.... |
2194 | if (!FunctionSamples::ProfileIsCS) |
2195 | for (const std::pair<Function *, NotInlinedProfileInfo> &pair : |
2196 | notInlinedCallInfo) |
2197 | updateProfileCallee(Callee: pair.first, EntryDelta: pair.second.entryCount); |
2198 | |
2199 | return retval; |
2200 | } |
2201 | |
2202 | bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { |
2203 | LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n" ); |
2204 | DILocation2SampleMap.clear(); |
2205 | // By default the entry count is initialized to -1, which will be treated |
2206 | // conservatively by getEntryCount as the same as unknown (None). This is |
2207 | // to avoid newly added code to be treated as cold. If we have samples |
2208 | // this will be overwritten in emitAnnotations. |
2209 | uint64_t initialEntryCount = -1; |
2210 | |
2211 | ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL; |
2212 | if (ProfileSampleAccurate || F.hasFnAttribute(Kind: "profile-sample-accurate" )) { |
2213 | // initialize all the function entry counts to 0. It means all the |
2214 | // functions without profile will be regarded as cold. |
2215 | initialEntryCount = 0; |
2216 | // profile-sample-accurate is a user assertion which has a higher precedence |
2217 | // than symbol list. When profile-sample-accurate is on, ignore symbol list. |
2218 | ProfAccForSymsInList = false; |
2219 | } |
2220 | CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList); |
2221 | |
2222 | // PSL -- profile symbol list include all the symbols in sampled binary. |
2223 | // If ProfileAccurateForSymsInList is enabled, PSL is used to treat |
2224 | // old functions without samples being cold, without having to worry |
2225 | // about new and hot functions being mistakenly treated as cold. |
2226 | if (ProfAccForSymsInList) { |
2227 | // Initialize the entry count to 0 for functions in the list. |
2228 | if (PSL->contains(Name: F.getName())) |
2229 | initialEntryCount = 0; |
2230 | |
2231 | // Function in the symbol list but without sample will be regarded as |
2232 | // cold. To minimize the potential negative performance impact it could |
2233 | // have, we want to be a little conservative here saying if a function |
2234 | // shows up in the profile, no matter as outline function, inline instance |
2235 | // or call targets, treat the function as not being cold. This will handle |
2236 | // the cases such as most callsites of a function are inlined in sampled |
2237 | // binary but not inlined in current build (because of source code drift, |
2238 | // imprecise debug information, or the callsites are all cold individually |
2239 | // but not cold accumulatively...), so the outline function showing up as |
2240 | // cold in sampled binary will actually not be cold after current build. |
2241 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
2242 | if ((FunctionSamples::UseMD5 && |
2243 | GUIDsInProfile.count(V: Function::getGUID(GlobalName: CanonName))) || |
2244 | (!FunctionSamples::UseMD5 && NamesInProfile.count(Key: CanonName))) |
2245 | initialEntryCount = -1; |
2246 | } |
2247 | |
2248 | // Initialize entry count when the function has no existing entry |
2249 | // count value. |
2250 | if (!F.getEntryCount()) |
2251 | F.setEntryCount(Count: ProfileCount(initialEntryCount, Function::PCT_Real)); |
2252 | std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; |
2253 | if (AM) { |
2254 | auto &FAM = |
2255 | AM->getResult<FunctionAnalysisManagerModuleProxy>(IR&: *F.getParent()) |
2256 | .getManager(); |
2257 | ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
2258 | } else { |
2259 | OwnedORE = std::make_unique<OptimizationRemarkEmitter>(args: &F); |
2260 | ORE = OwnedORE.get(); |
2261 | } |
2262 | |
2263 | if (FunctionSamples::ProfileIsCS) |
2264 | Samples = ContextTracker->getBaseSamplesFor(Func: F); |
2265 | else { |
2266 | Samples = Reader->getSamplesFor(F); |
2267 | // Try search in previously inlined functions that were split or duplicated |
2268 | // into base. |
2269 | if (!Samples) { |
2270 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
2271 | auto It = OutlineFunctionSamples.find(x: FunctionId(CanonName)); |
2272 | if (It != OutlineFunctionSamples.end()) { |
2273 | Samples = &It->second; |
2274 | } else if (auto Remapper = Reader->getRemapper()) { |
2275 | if (auto RemppedName = Remapper->lookUpNameInProfile(FunctionName: CanonName)) { |
2276 | It = OutlineFunctionSamples.find(x: FunctionId(*RemppedName)); |
2277 | if (It != OutlineFunctionSamples.end()) |
2278 | Samples = &It->second; |
2279 | } |
2280 | } |
2281 | } |
2282 | } |
2283 | |
2284 | if (Samples && !Samples->empty()) |
2285 | return emitAnnotations(F); |
2286 | return false; |
2287 | } |
2288 | SampleProfileLoaderPass::SampleProfileLoaderPass( |
2289 | std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase, |
2290 | IntrusiveRefCntPtr<vfs::FileSystem> FS) |
2291 | : ProfileFileName(File), ProfileRemappingFileName(RemappingFile), |
2292 | LTOPhase(LTOPhase), FS(std::move(FS)) {} |
2293 | |
2294 | PreservedAnalyses SampleProfileLoaderPass::run(Module &M, |
2295 | ModuleAnalysisManager &AM) { |
2296 | FunctionAnalysisManager &FAM = |
2297 | AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager(); |
2298 | |
2299 | auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { |
2300 | return FAM.getResult<AssumptionAnalysis>(IR&: F); |
2301 | }; |
2302 | auto GetTTI = [&](Function &F) -> TargetTransformInfo & { |
2303 | return FAM.getResult<TargetIRAnalysis>(IR&: F); |
2304 | }; |
2305 | auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & { |
2306 | return FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
2307 | }; |
2308 | |
2309 | if (!FS) |
2310 | FS = vfs::getRealFileSystem(); |
2311 | |
2312 | SampleProfileLoader SampleLoader( |
2313 | ProfileFileName.empty() ? SampleProfileFile : ProfileFileName, |
2314 | ProfileRemappingFileName.empty() ? SampleProfileRemappingFile |
2315 | : ProfileRemappingFileName, |
2316 | LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI); |
2317 | |
2318 | if (!SampleLoader.doInitialization(M, FAM: &FAM)) |
2319 | return PreservedAnalyses::all(); |
2320 | |
2321 | ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(IR&: M); |
2322 | LazyCallGraph &CG = AM.getResult<LazyCallGraphAnalysis>(IR&: M); |
2323 | if (!SampleLoader.runOnModule(M, AM: &AM, PSI: PSI, CG)) |
2324 | return PreservedAnalyses::all(); |
2325 | |
2326 | return PreservedAnalyses::none(); |
2327 | } |
2328 | |