//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));
} // namespace llvm
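
// Note: a convenient way to exercise this pass in isolation (mirroring how
// the LLVM regression tests typically drive it; exact flags may vary by
// version) is something like:
//   opt -passes=memprof-context-disambiguation -supports-hot-cold-new in.ll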

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
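///
/// For example (a purely hypothetical illustration, not from any real
/// profile): if an allocation in C is reached via contexts A->B->C (cold) and
/// D->B->C (not cold), the graph gets an allocation node for C plus stack
/// nodes for B, A and D, with each edge decorated with the ids of the contexts
/// flowing through it. Cloning B and C along the A->B->C path then allows the
/// allocation to be annotated cold on that clone only, leaving the D->B->C
/// context unaffected.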
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case where we have an Instruction* it
    // is not as immediately discoverable. Used for printing richer information
    // when dumping the graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // The corresponding allocation or interior call.
    CallInfo Call;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
    uint8_t AllocTypes = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // The set of IDs for contexts including this node.
    DenseSet<uint32_t> ContextIds;

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // its context id set, caller edges, and callee edges should all be empty.
    bool isRemoved() const {
      // Note that we can have non-empty context ids with empty caller and
      // callee edges if the graph ends up with a single node.
      if (ContextIds.empty())
        assert(CalleeEdges.empty() && CallerEdges.empty() &&
               "Context ids empty but at least one of callee and caller edges "
               "were not!");
      return ContextIds.empty();
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(ContextIds) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helpers to remove callee edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);
  void
  recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
                                       DenseSet<const ContextNode *> &Visited);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

  using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
                                     const FuncTy *, DenseSet<uint32_t>>;

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
  /// next element after the input position (either incremented or updated after
  /// removing the old edge).
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording the mapping of
  /// all of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Removes the node information recorded for the given call.
  void unsetNodeForInst(const CallInfo &C);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids);

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list. If ContextIdsToMove is non-empty, only that
  /// subset of Edge's ids are moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           EdgeIter *CallerEdgeI = nullptr,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list. If ContextIdsToMove is non-empty, only that
  /// subset of Edge's ids are moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     EdgeIter *CallerEdgeI = nullptr,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  std::map<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

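  // operator-> is an identity operation so that CallInfo::call()->print(OS)
  // compiles when CallTy is IndexCall, which is a value type rather than a
  // pointer.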
  IndexCall *operator->() { return this; }

  PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }

  void print(raw_ostream &OS) const {
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
      assert(CI);
      OS << *CI;
    }
  }
};

/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves mapping from function summaries containing memprof records back to
  // their VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace

namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

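// Helper to print a separator between streamed fields, suppressing the
// separator before the first field printed (see the operator<< overload
// below).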
struct FieldSeparator {
  bool Skip = true;
  const char *Sep;

  FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
};

raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
  if (FS.Skip) {
    FS.Skip = false;
    return OS;
  }
  return OS << FS.Sep;
}

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
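// For example, a NotCold|Cold mix resolves to NotCold here, while a pure Cold
// or NotCold value is returned unchanged.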
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}

} // end anonymous namespace

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
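  // Check the allocation call map first; a given call is recorded in at most
  // one of the two maps.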
  ContextNode *Node = getNodeForAlloc(C);
  if (Node)
    return Node;

  return NonAllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
  if (StackEntryNode != StackEntryIdToContextNodeMap.end())
    return StackEntryNode->second;
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
    const CallInfo &C) {
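  // The call is recorded in at most one of the maps, so the || short-circuits
  // and the second erase is only attempted when the first found nothing.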
  AllocationCallToContextNodeMap.erase(C) ||
      NonAllocationCallToContextNodeMap.erase(C);
  assert(!AllocationCallToContextNodeMap.count(C) &&
         !NonAllocationCallToContextNodeMap.count(C));
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
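  // If an edge from this caller already exists, simply fold the new context id
  // and allocation type into it.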
  for (auto &Edge : CallerEdges) {
    if (Edge->Caller == Caller) {
      Edge->AllocTypes |= (uint8_t)AllocType;
      Edge->getContextIds().insert(ContextId);
      return;
    }
  }
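  // Otherwise create a new edge and register it on both endpoints.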
  std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(Edge);
  Caller->CalleeEdges.push_back(Edge);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
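  // Advance the iterator manually so edges can be erased while walking the
  // list.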
  for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      Edge->Callee->eraseCallerEdge(Edge.get());
      EI = Node->CalleeEdges.erase(EI);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  for (const auto &Edge : CalleeEdges)
    if (Edge->Callee == Callee)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  for (const auto &Edge : CallerEdges)
    if (Edge->Caller == Caller)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
        return CalleeEdge.get() == Edge;
      });
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
        return CallerEdge.get() == Edge;
      });
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  if (Node1Ids.size() < Node2Ids.size())
    return intersectAllocTypesImpl(Node1Ids, Node2Ids);
  else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
    CallInfo Call, const FuncTy *F) {
  assert(!getNodeForAlloc(Call));
  NodeOwner.push_back(
      std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
  ContextNode *AllocNode = NodeOwner.back().get();
  AllocationCallToContextNodeMap[Call] = AllocNode;
  NodeToCallingFunc[AllocNode] = F;
  // Use LastContextId as a unique id for MIB allocation nodes.
  AllocNode->OrigStackOrAllocId = LastContextId;
  // Alloc type should be updated as we add in the MIBs. We should assert
  // afterwards that it is not still None.
  AllocNode->AllocTypes = (uint8_t)AllocationType::None;

  return AllocNode;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
  // Treat the hot alloc type as NotCold until the disambiguation for "hot" is
  // done.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  ContextIdToAllocationType[++LastContextId] = AllocType;

  // Update alloc type and context ids for this MIB.
  AllocNode->AllocTypes |= (uint8_t)AllocType;
  AllocNode->ContextIds.insert(LastContextId);

  // Now add or update nodes for each stack id in alloc's context.
  // Later when processing the stack ids on non-alloc callsites we will adjust
  // for any inlining in the context.
  ContextNode *PrevNode = AllocNode;
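  // We walk the stack from the allocation outwards, so each stack node becomes
  // the caller of the node added just before it (the allocation itself is the
  // innermost callee).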
  // Look for recursion (direct recursion should have been collapsed by
  // module summary analysis, here we should just be detecting mutual
  // recursion). Mark these nodes so we don't try to clone.
  SmallSet<uint64_t, 8> StackIdSet;
  // Skip any on the allocation call (inlining).
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      NodeOwner.push_back(
          std::make_unique<ContextNode>(/*IsAllocation=*/false));
      StackNode = NodeOwner.back().get();
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    auto Ins = StackIdSet.insert(StackId);
    if (!Ins.second)
      StackNode->Recursive = true;
    StackNode->ContextIds.insert(LastContextId);
    StackNode->AllocTypes |= (uint8_t)AllocType;
    PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    PrevNode = StackNode;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
DenseSet<uint32_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
    const DenseSet<uint32_t> &StackSequenceContextIds,
    DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  DenseSet<uint32_t> NewContextIds;
  for (auto OldId : StackSequenceContextIds) {
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
  }
  return NewContextIds;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Build a set of duplicated context ids corresponding to the input id set.
  auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
    DenseSet<uint32_t> NewIds;
    for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert(NewId->second.begin(), NewId->second.end());
    return NewIds;
  };

  // Recursively update context ids sets along caller edges.
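  // Note the lambda receives itself as its final parameter (UpdateCallers), a
  // common idiom for writing a recursive lambda without std::function.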
  auto UpdateCallers = [&](ContextNode *Node,
                           DenseSet<const ContextEdge *> &Visited,
                           auto &&UpdateCallers) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      auto Inserted = Visited.insert(Edge.get());
      if (!Inserted.second)
        continue;
      ContextNode *NextNode = Edge->Caller;
      DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
      // Only need to recursively iterate to NextNode via this caller edge if
      // it resulted in any added ids to NextNode.
      if (!NewIdsToAdd.empty()) {
        Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        UpdateCallers(NextNode, Visited, UpdateCallers);
      }
    }
  };

  DenseSet<const ContextEdge *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    // Update ids on the allocation nodes before calling the recursive
    // update along caller edges, since this simplifies the logic during
    // that traversal.
    DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
    Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
    UpdateCallers(Node, Visited, UpdateCallers);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
  // Make a copy of the context ids, since this will be adjusted below as they
  // are moved.
  DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
  auto &OrigEdges =
      TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
  // Increment iterator in loop so that we can remove edges as needed.
  for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
    auto Edge = *EI;
    // Remove any matching context ids from Edge and return the set that were
    // found and removed; these become the new edge's context ids. Also update
    // the remaining (not found) ids.
    DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
    set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                 NotFoundContextIds);
    RemainingContextIds.swap(NotFoundContextIds);
    // If no matching context ids for this edge, skip it.
    if (NewEdgeContextIds.empty()) {
      ++EI;
      continue;
    }
    if (TowardsCallee) {
      auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
          NewEdgeContextIds);
      NewNode->CalleeEdges.push_back(NewEdge);
      NewEdge->Callee->CallerEdges.push_back(NewEdge);
    } else {
      auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
          NewEdgeContextIds);
      NewNode->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
    // Remove old edge if context ids empty.
    if (Edge->getContextIds().empty()) {
      if (TowardsCallee) {
        Edge->Callee->eraseCallerEdge(Edge.get());
        EI = OrigNode->CalleeEdges.erase(EI);
      } else {
        Edge->Caller->eraseCalleeEdge(Edge.get());
        EI = OrigNode->CallerEdges.erase(EI);
      }
      continue;
    }
    ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    assignStackNodesPostOrder(ContextNode *Node,
                              DenseSet<const ContextNode *> &Visited,
                              DenseMap<uint64_t, std::vector<CallContextInfo>>
                                  &StackIdToMatchingCalls) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;
  // Post order traversal. Iterate over a copy since we may add nodes and
  // therefore new callers during the recursive call, invalidating any
  // iterator over the original edge vector. We don't need to process these
  // new nodes as they were already processed on creation.
  auto CallerEdges = Node->CallerEdges;
  for (auto &Edge : CallerEdges) {
    // Skip any that have been removed during the recursion.
    if (!Edge)
      continue;
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
  }

  // If this node's stack id is in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.

  // Ignore this node if it is for an allocation or we didn't record any
  // stack id lists ending at it.
  if (Node->IsAllocation ||
      !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
    return;

  auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
  // Handle the simple case first. A single call with a single stack id.
  // In this case there is no need to create any new context nodes, simply
  // assign the context node for stack id to this Call.
  if (Calls.size() == 1) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
    if (Ids.size() == 1) {
      assert(SavedContextIds.empty());
      // It should be this Node.
      assert(Node == getNodeForStackId(Ids[0]));
      if (Node->Recursive)
        return;
      Node->setCall(Call);
      NonAllocationCallToContextNodeMap[Call] = Node;
      NodeToCallingFunc[Node] = Func;
      return;
    }
  }

  // Find the node for the last stack id, which should be the same
  // across all calls recorded for this id, and is this node's id.
  uint64_t LastId = Node->OrigStackOrAllocId;
  ContextNode *LastNode = getNodeForStackId(LastId);
  // We should only have kept stack ids that had nodes.
  assert(LastNode);

  for (unsigned I = 0; I < Calls.size(); I++) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
    // Skip any for which we didn't assign any ids, these don't get a node in
    // the graph.
    if (SavedContextIds.empty())
      continue;

    assert(LastId == Ids.back());

    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
    assert(FirstNode);

    // Recompute the context ids for this stack id sequence (the
    // intersection of the context ids of the corresponding nodes).
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and removed
    // them already during the post order traversal.
    set_intersect(SavedContextIds, FirstNode->ContextIds);
    ContextNode *PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes and weren't
      // recursive.
      assert(CurNode);
      assert(!CurNode->Recursive);
      if (!PrevNode) {
        PrevNode = CurNode;
        continue;
      }
      auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
      if (!Edge) {
        SavedContextIds.clear();
        break;
      }
      PrevNode = CurNode;
      set_intersect(SavedContextIds, Edge->getContextIds());

      // If we now have no context ids for the clone, skip this call.
      if (SavedContextIds.empty())
        break;
    }
    if (SavedContextIds.empty())
      continue;
1219 | |
1220 | // Create new context node. |
1221 | NodeOwner.push_back( |
1222 | std::make_unique<ContextNode>(/*IsAllocation=*/false, Call)); |
1223 | ContextNode *NewNode = NodeOwner.back().get(); |
1224 | NodeToCallingFunc[NewNode] = Func; |
1225 | NonAllocationCallToContextNodeMap[Call] = NewNode; |
1226 | NewNode->ContextIds = SavedContextIds; |
1227 | NewNode->AllocTypes = computeAllocType(ContextIds&: NewNode->ContextIds); |
1228 | |
// Connect to callees of innermost stack frame in inlined call chain.
// This updates context ids for FirstNode's callees to reflect those
// moved to NewNode.
connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);

// Connect to callers of outermost stack frame in inlined call chain.
// This updates context ids for LastNode's callers to reflect those
// moved to NewNode.
connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1238 | |
1239 | // Now we need to remove context ids from edges/nodes between First and |
1240 | // Last Node. |
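// For example (hypothetical ids): if NewNode took over context ids {5, 6},
// each node along Ids and each connecting edge drops 5 and 6 below, and any
// edge whose context id set becomes empty is erased from both endpoints.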
1241 | PrevNode = nullptr; |
1242 | for (auto Id : Ids) { |
ContextNode *CurNode = getNodeForStackId(Id);
1244 | // We should only have kept stack ids that had nodes. |
1245 | assert(CurNode); |
1246 | |
1247 | // Remove the context ids moved to NewNode from CurNode, and the |
1248 | // edge from the prior node. |
1249 | set_subtract(CurNode->ContextIds, NewNode->ContextIds); |
1250 | if (PrevNode) { |
1251 | auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); |
1252 | assert(PrevEdge); |
1253 | set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds); |
1254 | if (PrevEdge->getContextIds().empty()) { |
1255 | PrevNode->eraseCallerEdge(PrevEdge); |
1256 | CurNode->eraseCalleeEdge(PrevEdge); |
1257 | } |
1258 | } |
1259 | PrevNode = CurNode; |
1260 | } |
1261 | } |
1262 | } |
1263 | |
1264 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1265 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { |
1266 | // Map of stack id to all calls with that as the last (outermost caller) |
1267 | // callsite id that has a context node (some might not due to pruning |
1268 | // performed during matching of the allocation profile contexts). |
1269 | // The CallContextInfo contains the Call and a list of its stack ids with |
1270 | // ContextNodes, the function containing Call, and the set of context ids |
1271 | // the analysis will eventually identify for use in any new node created |
1272 | // for that callsite. |
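// For example (hypothetical functions and ids): if bar was inlined into foo,
// a call in foo may carry callsite metadata with stack ids {B, F} (leaf to
// outermost). It is recorded under its last id F, together with any other
// calls whose outermost stack id with a context node is also F.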
1273 | DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls; |
1274 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
1275 | for (auto &Call : CallsWithMetadata) { |
1276 | // Ignore allocations, already handled. |
1277 | if (AllocationCallToContextNodeMap.count(Call)) |
1278 | continue; |
1279 | auto StackIdsWithContextNodes = |
getStackIdsWithContextNodesForCall(Call.call());
1281 | // If there were no nodes created for MIBs on allocs (maybe this was in |
1282 | // the unambiguous part of the MIB stack that was pruned), ignore. |
1283 | if (StackIdsWithContextNodes.empty()) |
1284 | continue; |
1285 | // Otherwise, record this Call along with the list of ids for the last |
1286 | // (outermost caller) stack id with a node. |
1287 | StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( |
1288 | {Call.call(), StackIdsWithContextNodes, Func, {}}); |
1289 | } |
1290 | } |
1291 | |
1292 | // First make a pass through all stack ids that correspond to a call, |
1293 | // as identified in the above loop. Compute the context ids corresponding to |
// each of these calls when they correspond to multiple stack ids due to
// inlining. Perform any duplication of context ids required when
1296 | // there is more than one call with the same stack ids. Their (possibly newly |
1297 | // duplicated) context ids are saved in the StackIdToMatchingCalls map. |
1298 | DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; |
1299 | for (auto &It : StackIdToMatchingCalls) { |
1300 | auto &Calls = It.getSecond(); |
1301 | // Skip single calls with a single stack id. These don't need a new node. |
1302 | if (Calls.size() == 1) { |
1303 | auto &Ids = std::get<1>(Calls[0]); |
1304 | if (Ids.size() == 1) |
1305 | continue; |
1306 | } |
1307 | // In order to do the best and maximal matching of inlined calls to context |
1308 | // node sequences we will sort the vectors of stack ids in descending order |
1309 | // of length, and within each length, lexicographically by stack id. The |
1310 | // latter is so that we can specially handle calls that have identical stack |
1311 | // id sequences (either due to cloning or artificially because of the MIB |
1312 | // context pruning). |
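// For example (hypothetical id sequences): {1, 2, 3} sorts before {1, 2},
// which sorts before {1, 3}; calls that share the identical sequence {1, 2}
// end up adjacent, which is what enables the duplicate handling below.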
1313 | std::stable_sort(Calls.begin(), Calls.end(), |
1314 | [](const CallContextInfo &A, const CallContextInfo &B) { |
1315 | auto &IdsA = std::get<1>(A); |
1316 | auto &IdsB = std::get<1>(B); |
1317 | return IdsA.size() > IdsB.size() || |
1318 | (IdsA.size() == IdsB.size() && IdsA < IdsB); |
1319 | }); |
1320 | |
1321 | // Find the node for the last stack id, which should be the same |
1322 | // across all calls recorded for this id, and is the id for this |
1323 | // entry in the StackIdToMatchingCalls map. |
1324 | uint64_t LastId = It.getFirst(); |
ContextNode *LastNode = getNodeForStackId(LastId);
1326 | // We should only have kept stack ids that had nodes. |
1327 | assert(LastNode); |
1328 | |
1329 | if (LastNode->Recursive) |
1330 | continue; |
1331 | |
1332 | // Initialize the context ids with the last node's. We will subsequently |
1333 | // refine the context ids by computing the intersection along all edges. |
1334 | DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds; |
1335 | assert(!LastNodeContextIds.empty()); |
1336 | |
1337 | for (unsigned I = 0; I < Calls.size(); I++) { |
1338 | auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
1339 | assert(SavedContextIds.empty()); |
1340 | assert(LastId == Ids.back()); |
1341 | |
1342 | // First compute the context ids for this stack id sequence (the |
1343 | // intersection of the context ids of the corresponding nodes). |
1344 | // Start with the remaining saved ids for the last node. |
1345 | assert(!LastNodeContextIds.empty()); |
1346 | DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; |
1347 | |
1348 | ContextNode *PrevNode = LastNode; |
1349 | ContextNode *CurNode = LastNode; |
1350 | bool Skip = false; |
1351 | |
1352 | // Iterate backwards through the stack Ids, starting after the last Id |
1353 | // in the list, which was handled once outside for all Calls. |
1354 | for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
1355 | auto Id = *IdIter; |
CurNode = getNodeForStackId(Id);
1357 | // We should only have kept stack ids that had nodes. |
1358 | assert(CurNode); |
1359 | |
1360 | if (CurNode->Recursive) { |
1361 | Skip = true; |
1362 | break; |
1363 | } |
1364 | |
1365 | auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
1366 | // If there is no edge then the nodes belong to different MIB contexts, |
1367 | // and we should skip this inlined context sequence. For example, this |
1368 | // particular inlined context may include stack ids A->B, and we may |
1369 | // indeed have nodes for both A and B, but it is possible that they were |
1370 | // never profiled in sequence in a single MIB for any allocation (i.e. |
1371 | // we might have profiled an allocation that involves the callsite A, |
1372 | // but through a different one of its callee callsites, and we might |
1373 | // have profiled an allocation that involves callsite B, but reached |
1374 | // from a different caller callsite). |
1375 | if (!Edge) { |
1376 | Skip = true; |
1377 | break; |
1378 | } |
1379 | PrevNode = CurNode; |
1380 | |
1381 | // Update the context ids, which is the intersection of the ids along |
1382 | // all edges in the sequence. |
1383 | set_intersect(StackSequenceContextIds, Edge->getContextIds()); |
1384 | |
1385 | // If we now have no context ids for clone, skip this call. |
1386 | if (StackSequenceContextIds.empty()) { |
1387 | Skip = true; |
1388 | break; |
1389 | } |
1390 | } |
1391 | if (Skip) |
1392 | continue; |
1393 | |
1394 | // If some of this call's stack ids did not have corresponding nodes (due |
1395 | // to pruning), don't include any context ids for contexts that extend |
1396 | // beyond these nodes. Otherwise we would be matching part of unrelated / |
1397 | // not fully matching stack contexts. To do this, subtract any context ids |
1398 | // found in caller nodes of the last node found above. |
1399 | if (Ids.back() != getLastStackId(Call)) { |
1400 | for (const auto &PE : LastNode->CallerEdges) { |
1401 | set_subtract(StackSequenceContextIds, PE->getContextIds()); |
1402 | if (StackSequenceContextIds.empty()) |
1403 | break; |
1404 | } |
1405 | // If we now have no context ids for clone, skip this call. |
1406 | if (StackSequenceContextIds.empty()) |
1407 | continue; |
1408 | } |
1409 | |
1410 | // Check if the next set of stack ids is the same (since the Calls vector |
1411 | // of tuples is sorted by the stack ids we can just look at the next one). |
1412 | bool DuplicateContextIds = false; |
1413 | if (I + 1 < Calls.size()) { |
1414 | auto NextIds = std::get<1>(Calls[I + 1]); |
1415 | DuplicateContextIds = Ids == NextIds; |
1416 | } |
1417 | |
1418 | // If we don't have duplicate context ids, then we can assign all the |
1419 | // context ids computed for the original node sequence to this call. |
1420 | // If there are duplicate calls with the same stack ids then we synthesize |
1421 | // new context ids that are duplicates of the originals. These are |
1422 | // assigned to SavedContextIds, which is a reference into the map entry |
1423 | // for this call, allowing us to access these ids later on. |
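// For example (hypothetical ids): if two calls both matched sequence {A, B}
// and the computed context ids were {7, 8}, the first call is assigned fresh
// duplicates (say {9, 10}), recorded in OldToNewContextIds, while the last
// call in the run keeps the original {7, 8}; the duplicates are propagated
// over the rest of the graph after this loop.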
OldToNewContextIds.reserve(OldToNewContextIds.size() +
1425 | StackSequenceContextIds.size()); |
1426 | SavedContextIds = |
1427 | DuplicateContextIds |
1428 | ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) |
1429 | : StackSequenceContextIds; |
1430 | assert(!SavedContextIds.empty()); |
1431 | |
1432 | if (!DuplicateContextIds) { |
1433 | // Update saved last node's context ids to remove those that are |
1434 | // assigned to other calls, so that it is ready for the next call at |
1435 | // this stack id. |
set_subtract(LastNodeContextIds, StackSequenceContextIds);
1437 | if (LastNodeContextIds.empty()) |
1438 | break; |
1439 | } |
1440 | } |
1441 | } |
1442 | |
1443 | // Propagate the duplicate context ids over the graph. |
1444 | propagateDuplicateContextIds(OldToNewContextIds); |
1445 | |
1446 | if (VerifyCCG) |
1447 | check(); |
1448 | |
1449 | // Now perform a post-order traversal over the graph, starting with the |
1450 | // allocation nodes, essentially processing nodes from callers to callees. |
1451 | // For any that contains an id in the map, update the graph to contain new |
1452 | // nodes representing any inlining at interior callsites. Note we move the |
1453 | // associated context ids over to the new nodes. |
1454 | DenseSet<const ContextNode *> Visited; |
1455 | for (auto &Entry : AllocationCallToContextNodeMap) |
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1457 | } |
1458 | |
1459 | uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { |
1460 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
Call->getMetadata(LLVMContext::MD_callsite));
1462 | return CallsiteContext.back(); |
1463 | } |
1464 | |
1465 | uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { |
1466 | assert(isa<CallsiteInfo *>(Call.getBase())); |
1467 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
// Need to convert index into stack id.
return Index.getStackIdAtIndex(CallsiteContext.back());
1471 | } |
1472 | |
static const std::string MemProfCloneSuffix = ".memprof.";
1474 | |
1475 | static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { |
1476 | // We use CloneNo == 0 to refer to the original version, which doesn't get |
1477 | // renamed with a suffix. |
1478 | if (!CloneNo) |
1479 | return Base.str(); |
1480 | return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); |
1481 | } |
1482 | |
1483 | std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, |
1484 | const Instruction *Call, |
1485 | unsigned CloneNo) const { |
1486 | return (Twine(Call->getFunction()->getName()) + " -> " + |
cast<CallBase>(Call)->getCalledFunction()->getName())
1488 | .str(); |
1489 | } |
1490 | |
1491 | std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, |
1492 | const IndexCall &Call, |
1493 | unsigned CloneNo) const { |
auto VI = FSToVIMap.find(Func);
assert(VI != FSToVIMap.end());
if (isa<AllocInfo *>(Call.getBase()))
return (VI->second.name() + " -> alloc").str();
else {
auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
return (VI->second.name() + " -> " +
getMemProfFuncName(Callsite->Callee.name(),
Callsite->Clones[CloneNo]))
.str();
1504 | } |
1505 | } |
1506 | |
1507 | std::vector<uint64_t> |
1508 | ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( |
1509 | Instruction *Call) { |
1510 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
Call->getMetadata(LLVMContext::MD_callsite));
1512 | return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( |
1513 | CallsiteContext); |
1514 | } |
1515 | |
1516 | std::vector<uint64_t> |
1517 | IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { |
1518 | assert(isa<CallsiteInfo *>(Call.getBase())); |
1519 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1521 | return getStackIdsWithContextNodes<CallsiteInfo, |
1522 | SmallVector<unsigned>::const_iterator>( |
1523 | CallsiteContext); |
1524 | } |
1525 | |
1526 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1527 | template <class NodeT, class IteratorT> |
1528 | std::vector<uint64_t> |
1529 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( |
1530 | CallStack<NodeT, IteratorT> &CallsiteContext) { |
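// Walk the callsite context from the leaf (innermost) frame outwards,
// collecting stack ids that have context nodes in the graph. Stop at the
// first id without a node: only the contiguous matched prefix is useful,
// since contexts beyond a pruned node cannot be matched further anyway.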
1531 | std::vector<uint64_t> StackIds; |
1532 | for (auto IdOrIndex : CallsiteContext) { |
1533 | auto StackId = getStackId(IdOrIndex); |
1534 | ContextNode *Node = getNodeForStackId(StackId); |
1535 | if (!Node) |
1536 | break; |
1537 | StackIds.push_back(StackId); |
1538 | } |
1539 | return StackIds; |
1540 | } |
1541 | |
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1543 | Module &M, |
1544 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) |
1545 | : Mod(M), OREGetter(OREGetter) { |
1546 | for (auto &F : M) { |
1547 | std::vector<CallInfo> CallsWithMetadata; |
1548 | for (auto &BB : F) { |
1549 | for (auto &I : BB) { |
if (!isa<CallBase>(I))
continue;
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
CallsWithMetadata.push_back(&I);
auto *AllocNode = addAllocNode(&I, &F);
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
assert(CallsiteMD);
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
// Add all of the MIBs and their stack nodes.
for (auto &MDOp : MemProfMD->operands()) {
auto *MIBMD = cast<const MDNode>(MDOp);
MDNode *StackNode = getMIBStackNode(MIBMD);
assert(StackNode);
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
AllocNode, StackContext, CallsiteContext,
getMIBAllocType(MIBMD));
}
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
// Memprof and callsite metadata on memory allocations no longer
// needed.
I.setMetadata(LLVMContext::MD_memprof, nullptr);
I.setMetadata(LLVMContext::MD_callsite, nullptr);
}
// For callsite metadata, add to list for this function for later use.
else if (I.getMetadata(LLVMContext::MD_callsite))
CallsWithMetadata.push_back(&I);
1577 | } |
1578 | } |
1579 | if (!CallsWithMetadata.empty()) |
1580 | FuncToCallsWithMetadata[&F] = CallsWithMetadata; |
1581 | } |
1582 | |
1583 | if (DumpCCG) { |
1584 | dbgs() << "CCG before updating call stack chains:\n" ; |
1585 | dbgs() << *this; |
1586 | } |
1587 | |
1588 | if (ExportToDot) |
exportToDot("prestackupdate");
1590 | |
1591 | updateStackNodes(); |
1592 | |
1593 | handleCallsitesWithMultipleTargets(); |
1594 | |
1595 | // Strip off remaining callsite metadata, no longer needed. |
1596 | for (auto &FuncEntry : FuncToCallsWithMetadata) |
1597 | for (auto &Call : FuncEntry.second) |
Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1599 | } |
1600 | |
1601 | IndexCallsiteContextGraph::IndexCallsiteContextGraph( |
1602 | ModuleSummaryIndex &Index, |
1603 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
1604 | isPrevailing) |
1605 | : Index(Index), isPrevailing(isPrevailing) { |
1606 | for (auto &I : Index) { |
auto VI = Index.getValueInfo(I);
1608 | for (auto &S : VI.getSummaryList()) { |
1609 | // We should only add the prevailing nodes. Otherwise we may try to clone |
1610 | // in a weak copy that won't be linked (and may be different than the |
1611 | // prevailing version). |
1612 | // We only keep the memprof summary on the prevailing copy now when |
1613 | // building the combined index, as a space optimization, however don't |
1614 | // rely on this optimization. The linker doesn't resolve local linkage |
1615 | // values so don't check whether those are prevailing. |
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1617 | !isPrevailing(VI.getGUID(), S.get())) |
1618 | continue; |
auto *FS = dyn_cast<FunctionSummary>(S.get());
1620 | if (!FS) |
1621 | continue; |
1622 | std::vector<CallInfo> CallsWithMetadata; |
1623 | if (!FS->allocs().empty()) { |
1624 | for (auto &AN : FS->mutableAllocs()) { |
1625 | // This can happen because of recursion elimination handling that |
1626 | // currently exists in ModuleSummaryAnalysis. Skip these for now. |
1627 | // We still added them to the summary because we need to be able to |
1628 | // correlate properly in applyImport in the backends. |
1629 | if (AN.MIBs.empty()) |
1630 | continue; |
CallsWithMetadata.push_back({&AN});
auto *AllocNode = addAllocNode({&AN}, FS);
1633 | // Pass an empty CallStack to the CallsiteContext (second) |
1634 | // parameter, since for ThinLTO we already collapsed out the inlined |
1635 | // stack ids on the allocation call during ModuleSummaryAnalysis. |
1636 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
1637 | EmptyContext; |
1638 | // Now add all of the MIBs and their stack nodes. |
1639 | for (auto &MIB : AN.MIBs) { |
1640 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
1641 | StackContext(&MIB); |
1642 | addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( |
AllocNode, StackContext, EmptyContext, MIB.AllocType);
1644 | } |
1645 | assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
// Initialize version 0 on the summary alloc node to the current alloc
// type, unless it has both types, in which case make it default, so
// that if we are unable to clone, the original version always ends up
// with the default allocation behavior.
AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1651 | } |
1652 | } |
1653 | // For callsite metadata, add to list for this function for later use. |
1654 | if (!FS->callsites().empty()) |
1655 | for (auto &SN : FS->mutableCallsites()) |
CallsWithMetadata.push_back({&SN});
1657 | |
1658 | if (!CallsWithMetadata.empty()) |
1659 | FuncToCallsWithMetadata[FS] = CallsWithMetadata; |
1660 | |
1661 | if (!FS->allocs().empty() || !FS->callsites().empty()) |
1662 | FSToVIMap[FS] = VI; |
1663 | } |
1664 | } |
1665 | |
1666 | if (DumpCCG) { |
1667 | dbgs() << "CCG before updating call stack chains:\n" ; |
1668 | dbgs() << *this; |
1669 | } |
1670 | |
1671 | if (ExportToDot) |
exportToDot("prestackupdate");
1673 | |
1674 | updateStackNodes(); |
1675 | |
1676 | handleCallsitesWithMultipleTargets(); |
1677 | } |
1678 | |
1679 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1680 | void CallsiteContextGraph<DerivedCCG, FuncTy, |
1681 | CallTy>::handleCallsitesWithMultipleTargets() { |
// Look for and work around callsites that call multiple functions.
// This can happen for indirect calls, which need better handling, and in
// rarer cases (e.g. macro expansion).
1685 | // TODO: To fix this for indirect calls we will want to perform speculative |
1686 | // devirtualization using either the normal PGO info with ICP, or using the |
1687 | // information in the profiled MemProf contexts. We can do this prior to |
1688 | // this transformation for regular LTO, and for ThinLTO we can simulate that |
1689 | // effect in the summary and perform the actual speculative devirtualization |
1690 | // while cloning in the ThinLTO backend. |
1691 | |
1692 | // Keep track of the new nodes synthesized for discovered tail calls missing |
1693 | // from the profiled contexts. |
1694 | MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; |
1695 | |
1696 | for (auto Entry = NonAllocationCallToContextNodeMap.begin(); |
1697 | Entry != NonAllocationCallToContextNodeMap.end();) { |
1698 | auto *Node = Entry->second; |
1699 | assert(Node->Clones.empty()); |
1700 | // Check all node callees and see if in the same function. |
1701 | bool Removed = false; |
1702 | auto Call = Node->Call.call(); |
1703 | for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { |
1704 | auto Edge = *EI; |
1705 | if (!Edge->Callee->hasCall()) { |
1706 | ++EI; |
1707 | continue; |
1708 | } |
1709 | assert(NodeToCallingFunc.count(Edge->Callee)); |
1710 | // Check if the called function matches that of the callee node. |
1711 | if (calleesMatch(Call, EI, TailCallToContextNodeMap)) |
1712 | continue; |
1713 | RemovedEdgesWithMismatchedCallees++; |
1714 | // Work around by setting Node to have a null call, so it gets |
1715 | // skipped during cloning. Otherwise assignFunctions will assert |
1716 | // because its data structures are not designed to handle this case. |
1717 | Entry = NonAllocationCallToContextNodeMap.erase(Entry); |
1718 | Node->setCall(CallInfo()); |
1719 | Removed = true; |
1720 | break; |
1721 | } |
1722 | if (!Removed) |
1723 | Entry++; |
1724 | } |
1725 | |
1726 | // Add the new nodes after the above loop so that the iteration is not |
1727 | // invalidated. |
1728 | for (auto &[Call, Node] : TailCallToContextNodeMap) |
1729 | NonAllocationCallToContextNodeMap[Call] = Node; |
1730 | } |
1731 | |
1732 | uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
1733 | // In the Module (IR) case this is already the Id. |
1734 | return IdOrIndex; |
1735 | } |
1736 | |
1737 | uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
1738 | // In the Index case this is an index into the stack id list in the summary |
1739 | // index, convert it to an Id. |
1740 | return Index.getStackIdAtIndex(Index: IdOrIndex); |
1741 | } |
1742 | |
1743 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1744 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( |
1745 | CallTy Call, EdgeIter &EI, |
1746 | MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { |
1747 | auto Edge = *EI; |
1748 | const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
1749 | const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; |
1750 | // Will be populated in order of callee to caller if we find a chain of tail |
1751 | // calls between the profiled caller and callee. |
1752 | std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; |
if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1754 | FoundCalleeChain)) { |
1755 | ++EI; |
1756 | return false; |
1757 | } |
1758 | |
1759 | // The usual case where the profiled callee matches that of the IR/summary. |
1760 | if (FoundCalleeChain.empty()) { |
1761 | ++EI; |
1762 | return true; |
1763 | } |
1764 | |
1765 | auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { |
1766 | auto *CurEdge = Callee->findEdgeFromCaller(Caller); |
1767 | // If there is already an edge between these nodes, simply update it and |
1768 | // return. |
1769 | if (CurEdge) { |
1770 | CurEdge->ContextIds.insert(Edge->ContextIds.begin(), |
1771 | Edge->ContextIds.end()); |
1772 | CurEdge->AllocTypes |= Edge->AllocTypes; |
1773 | return; |
1774 | } |
1775 | // Otherwise, create a new edge and insert it into the caller and callee |
1776 | // lists. |
1777 | auto NewEdge = std::make_shared<ContextEdge>( |
1778 | Callee, Caller, Edge->AllocTypes, Edge->ContextIds); |
1779 | Callee->CallerEdges.push_back(NewEdge); |
1780 | if (Caller == Edge->Caller) { |
1781 | // If we are inserting the new edge into the current edge's caller, insert |
1782 | // the new edge before the current iterator position, and then increment |
1783 | // back to the current edge. |
1784 | EI = Caller->CalleeEdges.insert(EI, NewEdge); |
1785 | ++EI; |
1786 | assert(*EI == Edge && |
1787 | "Iterator position not restored after insert and increment" ); |
1788 | } else |
1789 | Caller->CalleeEdges.push_back(NewEdge); |
1790 | }; |
1791 | |
1792 | // Create new nodes for each found callee and connect in between the profiled |
1793 | // caller and callee. |
1794 | auto *CurCalleeNode = Edge->Callee; |
1795 | for (auto &[NewCall, Func] : FoundCalleeChain) { |
1796 | ContextNode *NewNode = nullptr; |
1797 | // First check if we have already synthesized a node for this tail call. |
1798 | if (TailCallToContextNodeMap.count(NewCall)) { |
1799 | NewNode = TailCallToContextNodeMap[NewCall]; |
1800 | NewNode->ContextIds.insert(Edge->ContextIds.begin(), |
1801 | Edge->ContextIds.end()); |
1802 | NewNode->AllocTypes |= Edge->AllocTypes; |
1803 | } else { |
1804 | FuncToCallsWithMetadata[Func].push_back({NewCall}); |
1805 | // Create Node and record node info. |
1806 | NodeOwner.push_back( |
1807 | std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall)); |
1808 | NewNode = NodeOwner.back().get(); |
1809 | NodeToCallingFunc[NewNode] = Func; |
1810 | TailCallToContextNodeMap[NewCall] = NewNode; |
1811 | NewNode->ContextIds = Edge->ContextIds; |
1812 | NewNode->AllocTypes = Edge->AllocTypes; |
1813 | } |
1814 | |
1815 | // Hook up node to its callee node |
1816 | AddEdge(NewNode, CurCalleeNode); |
1817 | |
1818 | CurCalleeNode = NewNode; |
1819 | } |
1820 | |
1821 | // Hook up edge's original caller to new callee node. |
1822 | AddEdge(Edge->Caller, CurCalleeNode); |
1823 | |
1824 | // Remove old edge |
1825 | Edge->Callee->eraseCallerEdge(Edge.get()); |
1826 | EI = Edge->Caller->CalleeEdges.erase(EI); |
1827 | |
1828 | return true; |
1829 | } |
1830 | |
1831 | bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
1832 | const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
1833 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
1834 | bool &FoundMultipleCalleeChains) { |
1835 | // Stop recursive search if we have already explored the maximum specified |
1836 | // depth. |
1837 | if (Depth > TailCallSearchDepth) |
1838 | return false; |
1839 | |
1840 | auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { |
FoundCalleeChain.push_back({Callsite, F});
1842 | }; |
1843 | |
auto *CalleeFunc = dyn_cast<Function>(CurCallee);
if (!CalleeFunc) {
auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
assert(Alias);
CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1849 | assert(CalleeFunc); |
1850 | } |
1851 | |
1852 | // Look for tail calls in this function, and check if they either call the |
1853 | // profiled callee directly, or indirectly (via a recursive search). |
// Only succeed if there is a single unique tail call chain found between the
// profiled caller and callee; otherwise we could perform incorrect cloning.
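// For example (hypothetical functions): if the profile recorded A calling C
// directly, but in the IR A tail-calls B and B tail-calls C, the unique
// chain through B is accepted and recorded in FoundCalleeChain. If C were
// also reachable from A via a second tail-call chain, we would instead set
// FoundMultipleCalleeChains and reject the match.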
1856 | bool FoundSingleCalleeChain = false; |
1857 | for (auto &BB : *CalleeFunc) { |
1858 | for (auto &I : BB) { |
auto *CB = dyn_cast<CallBase>(&I);
1860 | if (!CB || !CB->isTailCall()) |
1861 | continue; |
1862 | auto *CalledValue = CB->getCalledOperand(); |
1863 | auto *CalledFunction = CB->getCalledFunction(); |
1864 | if (CalledValue && !CalledFunction) { |
1865 | CalledValue = CalledValue->stripPointerCasts(); |
1866 | // Stripping pointer casts can reveal a called function. |
CalledFunction = dyn_cast<Function>(CalledValue);
1868 | } |
1869 | // Check if this is an alias to a function. If so, get the |
1870 | // called aliasee for the checks below. |
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction &&
"Expected null called function in callsite for alias");
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
1875 | } |
1876 | if (!CalledFunction) |
1877 | continue; |
1878 | if (CalledFunction == ProfiledCallee) { |
1879 | if (FoundSingleCalleeChain) { |
1880 | FoundMultipleCalleeChains = true; |
1881 | return false; |
1882 | } |
1883 | FoundSingleCalleeChain = true; |
1884 | FoundProfiledCalleeCount++; |
1885 | FoundProfiledCalleeDepth += Depth; |
1886 | if (Depth > FoundProfiledCalleeMaxDepth) |
1887 | FoundProfiledCalleeMaxDepth = Depth; |
1888 | SaveCallsiteInfo(&I, CalleeFunc); |
1889 | } else if (findProfiledCalleeThroughTailCalls( |
ProfiledCallee, CalledFunction, Depth + 1,
1891 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
1892 | if (FoundMultipleCalleeChains) |
1893 | return false; |
1894 | if (FoundSingleCalleeChain) { |
1895 | FoundMultipleCalleeChains = true; |
1896 | return false; |
1897 | } |
1898 | FoundSingleCalleeChain = true; |
1899 | SaveCallsiteInfo(&I, CalleeFunc); |
1900 | } |
1901 | } |
1902 | } |
1903 | |
1904 | return FoundSingleCalleeChain; |
1905 | } |
1906 | |
1907 | bool ModuleCallsiteContextGraph::calleeMatchesFunc( |
1908 | Instruction *Call, const Function *Func, const Function *CallerFunc, |
1909 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { |
auto *CB = dyn_cast<CallBase>(Call);
1911 | if (!CB->getCalledOperand()) |
1912 | return false; |
1913 | auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); |
auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1915 | if (CalleeFunc == Func) |
1916 | return true; |
auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1918 | if (Alias && Alias->getAliasee() == Func) |
1919 | return true; |
1920 | |
1921 | // Recursively search for the profiled callee through tail calls starting with |
1922 | // the actual Callee. The discovered tail call chain is saved in |
// FoundCalleeChain, and we will fix up the graph to include these callsites
1924 | // after returning. |
1925 | // FIXME: We will currently redo the same recursive walk if we find the same |
1926 | // mismatched callee from another callsite. We can improve this with more |
1927 | // bookkeeping of the created chain of new nodes for each mismatch. |
1928 | unsigned Depth = 1; |
1929 | bool FoundMultipleCalleeChains = false; |
if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
1931 | FoundCalleeChain, |
1932 | FoundMultipleCalleeChains)) { |
1933 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " |
1934 | << Func->getName() << " from " << CallerFunc->getName() |
1935 | << " that actually called " << CalleeVal->getName() |
1936 | << (FoundMultipleCalleeChains |
1937 | ? " (found multiple possible chains)" |
1938 | : "" ) |
1939 | << "\n" ); |
1940 | if (FoundMultipleCalleeChains) |
1941 | FoundProfiledCalleeNonUniquelyCount++; |
1942 | return false; |
1943 | } |
1944 | |
1945 | return true; |
1946 | } |
1947 | |
1948 | bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
1949 | ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
1950 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
1951 | bool &FoundMultipleCalleeChains) { |
1952 | // Stop recursive search if we have already explored the maximum specified |
1953 | // depth. |
1954 | if (Depth > TailCallSearchDepth) |
1955 | return false; |
1956 | |
1957 | auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { |
1958 | // Make a CallsiteInfo for each discovered callee, if one hasn't already |
1959 | // been synthesized. |
if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
!FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
// StackIds is empty (we don't have debug info available in the index for
// these callsites).
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
CallsiteInfo *NewCallsiteInfo =
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
FoundCalleeChain.push_back({NewCallsiteInfo, FS});
1969 | }; |
1970 | |
1971 | // Look for tail calls in this function, and check if they either call the |
1972 | // profiled callee directly, or indirectly (via a recursive search). |
1973 | // Only succeed if there is a single unique tail call chain found between the |
// profiled caller and callee; otherwise we could perform incorrect cloning.
1975 | bool FoundSingleCalleeChain = false; |
1976 | for (auto &S : CurCallee.getSummaryList()) { |
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1978 | !isPrevailing(CurCallee.getGUID(), S.get())) |
1979 | continue; |
auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
if (!FS)
continue;
auto FSVI = CurCallee;
auto *AS = dyn_cast<AliasSummary>(S.get());
1985 | if (AS) |
1986 | FSVI = AS->getAliaseeVI(); |
1987 | for (auto &CallEdge : FS->calls()) { |
1988 | if (!CallEdge.second.hasTailCall()) |
1989 | continue; |
1990 | if (CallEdge.first == ProfiledCallee) { |
1991 | if (FoundSingleCalleeChain) { |
1992 | FoundMultipleCalleeChains = true; |
1993 | return false; |
1994 | } |
1995 | FoundSingleCalleeChain = true; |
1996 | FoundProfiledCalleeCount++; |
1997 | FoundProfiledCalleeDepth += Depth; |
1998 | if (Depth > FoundProfiledCalleeMaxDepth) |
1999 | FoundProfiledCalleeMaxDepth = Depth; |
2000 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2001 | // Add FS to FSToVIMap in case it isn't already there. |
2002 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2003 | FSToVIMap[FS] = FSVI; |
2004 | } else if (findProfiledCalleeThroughTailCalls( |
ProfiledCallee, CallEdge.first, Depth + 1,
2006 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
2007 | if (FoundMultipleCalleeChains) |
2008 | return false; |
2009 | if (FoundSingleCalleeChain) { |
2010 | FoundMultipleCalleeChains = true; |
2011 | return false; |
2012 | } |
2013 | FoundSingleCalleeChain = true; |
2014 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2015 | // Add FS to FSToVIMap in case it isn't already there. |
2016 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2017 | FSToVIMap[FS] = FSVI; |
2018 | } |
2019 | } |
2020 | } |
2021 | |
2022 | return FoundSingleCalleeChain; |
2023 | } |
2024 | |
2025 | bool IndexCallsiteContextGraph::calleeMatchesFunc( |
2026 | IndexCall &Call, const FunctionSummary *Func, |
2027 | const FunctionSummary *CallerFunc, |
2028 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { |
ValueInfo Callee =
dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
// If there is no summary list then this is a call to an externally defined
// symbol.
AliasSummary *Alias =
Callee.getSummaryList().empty()
? nullptr
: dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2037 | assert(FSToVIMap.count(Func)); |
2038 | auto FuncVI = FSToVIMap[Func]; |
2039 | if (Callee == FuncVI || |
2040 | // If callee is an alias, check the aliasee, since only function |
2041 | // summary base objects will contain the stack node summaries and thus |
2042 | // get a context node. |
2043 | (Alias && Alias->getAliaseeVI() == FuncVI)) |
2044 | return true; |
2045 | |
2046 | // Recursively search for the profiled callee through tail calls starting with |
2047 | // the actual Callee. The discovered tail call chain is saved in |
// FoundCalleeChain, and we will fix up the graph to include these callsites
2049 | // after returning. |
2050 | // FIXME: We will currently redo the same recursive walk if we find the same |
2051 | // mismatched callee from another callsite. We can improve this with more |
2052 | // bookkeeping of the created chain of new nodes for each mismatch. |
2053 | unsigned Depth = 1; |
2054 | bool FoundMultipleCalleeChains = false; |
2055 | if (!findProfiledCalleeThroughTailCalls( |
FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2057 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI |
2058 | << " from " << FSToVIMap[CallerFunc] |
2059 | << " that actually called " << Callee |
2060 | << (FoundMultipleCalleeChains |
2061 | ? " (found multiple possible chains)" |
2062 | : "" ) |
2063 | << "\n" ); |
2064 | if (FoundMultipleCalleeChains) |
2065 | FoundProfiledCalleeNonUniquelyCount++; |
2066 | return false; |
2067 | } |
2068 | |
2069 | return true; |
2070 | } |
2071 | |
2072 | static std::string getAllocTypeString(uint8_t AllocTypes) { |
2073 | if (!AllocTypes) |
2074 | return "None" ; |
2075 | std::string Str; |
2076 | if (AllocTypes & (uint8_t)AllocationType::NotCold) |
2077 | Str += "NotCold" ; |
2078 | if (AllocTypes & (uint8_t)AllocationType::Cold) |
2079 | Str += "Cold" ; |
2080 | return Str; |
2081 | } |
2082 | |
2083 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2084 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() |
2085 | const { |
print(dbgs());
dbgs() << "\n";
2088 | } |
2089 | |
2090 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2091 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( |
2092 | raw_ostream &OS) const { |
2093 | OS << "Node " << this << "\n" ; |
2094 | OS << "\t" ; |
2095 | printCall(OS); |
2096 | if (Recursive) |
2097 | OS << " (recursive)" ; |
2098 | OS << "\n" ; |
2099 | OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n" ; |
2100 | OS << "\tContextIds:" ; |
2101 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
2102 | std::sort(first: SortedIds.begin(), last: SortedIds.end()); |
2103 | for (auto Id : SortedIds) |
2104 | OS << " " << Id; |
2105 | OS << "\n" ; |
2106 | OS << "\tCalleeEdges:\n" ; |
2107 | for (auto &Edge : CalleeEdges) |
2108 | OS << "\t\t" << *Edge << "\n" ; |
2109 | OS << "\tCallerEdges:\n" ; |
2110 | for (auto &Edge : CallerEdges) |
2111 | OS << "\t\t" << *Edge << "\n" ; |
2112 | if (!Clones.empty()) { |
2113 | OS << "\tClones: " ; |
2114 | FieldSeparator FS; |
2115 | for (auto *Clone : Clones) |
2116 | OS << FS << Clone; |
2117 | OS << "\n" ; |
2118 | } else if (CloneOf) { |
2119 | OS << "\tClone of " << CloneOf << "\n" ; |
2120 | } |
2121 | } |
2122 | |
2123 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2124 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() |
2125 | const { |
print(dbgs());
dbgs() << "\n";
2128 | } |
2129 | |
2130 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2131 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( |
2132 | raw_ostream &OS) const { |
2133 | OS << "Edge from Callee " << Callee << " to Caller: " << Caller |
2134 | << " AllocTypes: " << getAllocTypeString(AllocTypes); |
2135 | OS << " ContextIds:" ; |
2136 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
2137 | std::sort(first: SortedIds.begin(), last: SortedIds.end()); |
2138 | for (auto Id : SortedIds) |
2139 | OS << " " << Id; |
2140 | } |
2141 | |
2142 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2143 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { |
print(dbgs());
2145 | } |
2146 | |
2147 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2148 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( |
2149 | raw_ostream &OS) const { |
2150 | OS << "Callsite Context Graph:\n" ; |
2151 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2152 | for (const auto Node : nodes<GraphType>(this)) { |
2153 | if (Node->isRemoved()) |
2154 | continue; |
2155 | Node->print(OS); |
2156 | OS << "\n" ; |
2157 | } |
2158 | } |
2159 | |
2160 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2161 | static void checkEdge( |
2162 | const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { |
2163 | // Confirm that alloc type is not None and that we have at least one context |
2164 | // id. |
2165 | assert(Edge->AllocTypes != (uint8_t)AllocationType::None); |
2166 | assert(!Edge->ContextIds.empty()); |
2167 | } |
2168 | |
2169 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2170 | static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, |
2171 | bool CheckEdges = true) { |
2172 | if (Node->isRemoved()) |
2173 | return; |
2174 | // Node's context ids should be the union of both its callee and caller edge |
2175 | // context ids. |
2176 | if (Node->CallerEdges.size()) { |
2177 | auto EI = Node->CallerEdges.begin(); |
2178 | auto &FirstEdge = *EI; |
2179 | EI++; |
2180 | DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds); |
2181 | for (; EI != Node->CallerEdges.end(); EI++) { |
2182 | const auto &Edge = *EI; |
2183 | if (CheckEdges) |
2184 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
2185 | set_union(CallerEdgeContextIds, Edge->ContextIds); |
2186 | } |
2187 | // Node can have more context ids than callers if some contexts terminate at |
2188 | // node and some are longer. |
2189 | assert(Node->ContextIds == CallerEdgeContextIds || |
2190 | set_is_subset(CallerEdgeContextIds, Node->ContextIds)); |
2191 | } |
2192 | if (Node->CalleeEdges.size()) { |
2193 | auto EI = Node->CalleeEdges.begin(); |
2194 | auto &FirstEdge = *EI; |
2195 | EI++; |
2196 | DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds); |
2197 | for (; EI != Node->CalleeEdges.end(); EI++) { |
2198 | const auto &Edge = *EI; |
2199 | if (CheckEdges) |
2200 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
2201 | set_union(CalleeEdgeContextIds, Edge->ContextIds); |
2202 | } |
2203 | assert(Node->ContextIds == CalleeEdgeContextIds); |
2204 | } |
2205 | } |
2206 | |
2207 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2208 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { |
2209 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2210 | for (const auto Node : nodes<GraphType>(this)) { |
2211 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2212 | for (auto &Edge : Node->CallerEdges) |
2213 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
2214 | } |
2215 | } |
2216 | |
2217 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2218 | struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { |
2219 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2220 | using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; |
2221 | |
2222 | using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; |
2223 | static NodeRef getNode(const NodePtrTy &P) { return P.get(); } |
2224 | |
2225 | using nodes_iterator = |
2226 | mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, |
2227 | decltype(&getNode)>; |
2228 | |
2229 | static nodes_iterator nodes_begin(GraphType G) { |
2230 | return nodes_iterator(G->NodeOwner.begin(), &getNode); |
2231 | } |
2232 | |
2233 | static nodes_iterator nodes_end(GraphType G) { |
2234 | return nodes_iterator(G->NodeOwner.end(), &getNode); |
2235 | } |
2236 | |
2237 | static NodeRef getEntryNode(GraphType G) { |
2238 | return G->NodeOwner.begin()->get(); |
2239 | } |
2240 | |
2241 | using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; |
2242 | static const ContextNode<DerivedCCG, FuncTy, CallTy> * |
2243 | GetCallee(const EdgePtrTy &P) { |
2244 | return P->Callee; |
2245 | } |
2246 | |
2247 | using ChildIteratorType = |
2248 | mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< |
2249 | DerivedCCG, FuncTy, CallTy>>>::const_iterator, |
2250 | decltype(&GetCallee)>; |
2251 | |
2252 | static ChildIteratorType child_begin(NodeRef N) { |
2253 | return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); |
2254 | } |
2255 | |
2256 | static ChildIteratorType child_end(NodeRef N) { |
2257 | return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); |
2258 | } |
2259 | }; |
2260 | |
2261 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2262 | struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> |
2263 | : public DefaultDOTGraphTraits { |
2264 | DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} |
2265 | |
2266 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2267 | using GTraits = GraphTraits<GraphType>; |
2268 | using NodeRef = typename GTraits::NodeRef; |
2269 | using ChildIteratorType = typename GTraits::ChildIteratorType; |
2270 | |
2271 | static std::string getNodeLabel(NodeRef Node, GraphType G) { |
2272 | std::string LabelString = |
2273 | (Twine("OrigId: " ) + (Node->IsAllocation ? "Alloc" : "" ) + |
2274 | Twine(Node->OrigStackOrAllocId)) |
2275 | .str(); |
2276 | LabelString += "\n" ; |
2277 | if (Node->hasCall()) { |
2278 | auto Func = G->NodeToCallingFunc.find(Node); |
2279 | assert(Func != G->NodeToCallingFunc.end()); |
2280 | LabelString += |
2281 | G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); |
2282 | } else { |
2283 | LabelString += "null call" ; |
2284 | if (Node->Recursive) |
2285 | LabelString += " (recursive)" ; |
2286 | else |
2287 | LabelString += " (external)" ; |
2288 | } |
2289 | return LabelString; |
2290 | } |
2291 | |
2292 | static std::string getNodeAttributes(NodeRef Node, GraphType) { |
2293 | std::string AttributeString = (Twine("tooltip=\"" ) + getNodeId(Node) + " " + |
2294 | getContextIds(ContextIds: Node->ContextIds) + "\"" ) |
2295 | .str(); |
2296 | AttributeString += |
2297 | (Twine(",fillcolor=\"" ) + getColor(AllocTypes: Node->AllocTypes) + "\"" ).str(); |
2298 | AttributeString += ",style=\"filled\"" ; |
2299 | if (Node->CloneOf) { |
2300 | AttributeString += ",color=\"blue\"" ; |
2301 | AttributeString += ",style=\"filled,bold,dashed\"" ; |
2302 | } else |
2303 | AttributeString += ",style=\"filled\"" ; |
2304 | return AttributeString; |
2305 | } |
2306 | |
2307 | static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, |
2308 | GraphType) { |
2309 | auto &Edge = *(ChildIter.getCurrent()); |
2310 | return (Twine("tooltip=\"" ) + getContextIds(ContextIds: Edge->ContextIds) + "\"" + |
2311 | Twine(",fillcolor=\"" ) + getColor(AllocTypes: Edge->AllocTypes) + "\"" ) |
2312 | .str(); |
2313 | } |
2314 | |
// Since the NodeOwner list includes nodes that are no longer connected to
// the graph, skip them here.
2317 | static bool isNodeHidden(NodeRef Node, GraphType) { |
2318 | return Node->isRemoved(); |
2319 | } |
2320 | |
2321 | private: |
2322 | static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { |
std::string IdString = "ContextIds:";
if (ContextIds.size() < 100) {
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds)
IdString += (" " + Twine(Id)).str();
} else {
IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2331 | } |
2332 | return IdString; |
2333 | } |
2334 | |
2335 | static std::string getColor(uint8_t AllocTypes) { |
if (AllocTypes == (uint8_t)AllocationType::NotCold)
// Color "brown1" actually looks like a lighter red.
return "brown1";
if (AllocTypes == (uint8_t)AllocationType::Cold)
return "cyan";
if (AllocTypes ==
((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
// Lighter purple.
return "mediumorchid1";
return "gray";
2346 | } |
2347 | |
2348 | static std::string getNodeId(NodeRef Node) { |
2349 | std::stringstream SStream; |
2350 | SStream << std::hex << "N0x" << (unsigned long long)Node; |
2351 | std::string Result = SStream.str(); |
2352 | return Result; |
2353 | } |
2354 | }; |
2355 | |
2356 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2357 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( |
2358 | std::string Label) const { |
2359 | WriteGraph(this, "" , false, Label, |
2360 | DotFilePathPrefix + "ccg." + Label + ".dot" ); |
2361 | } |
2362 | |
2363 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2364 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
2365 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( |
2366 | const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI, |
2367 | DenseSet<uint32_t> ContextIdsToMove) { |
2368 | ContextNode *Node = Edge->Callee; |
2369 | NodeOwner.push_back( |
2370 | std::make_unique<ContextNode>(Node->IsAllocation, Node->Call)); |
2371 | ContextNode *Clone = NodeOwner.back().get(); |
2372 | Node->addClone(Clone); |
2373 | assert(NodeToCallingFunc.count(Node)); |
2374 | NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; |
moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2376 | ContextIdsToMove); |
2377 | return Clone; |
2378 | } |
2379 | |
2380 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2381 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
2382 | moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
2383 | ContextNode *NewCallee, EdgeIter *CallerEdgeI, |
2384 | bool NewClone, |
2385 | DenseSet<uint32_t> ContextIdsToMove) { |
2386 | // NewCallee and Edge's current callee must be clones of the same original |
2387 | // node (Edge's current callee may be the original node too). |
2388 | assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); |
2389 | |
2390 | ContextNode *OldCallee = Edge->Callee; |
2391 | |
2392 | // We might already have an edge to the new callee from earlier cloning for a |
2393 | // different allocation. If one exists we will reuse it. |
2394 | auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller); |
2395 | |
2396 | // Callers will pass an empty ContextIdsToMove set when they want to move the |
2397 | // edge. Copy in Edge's ids for simplicity. |
2398 | if (ContextIdsToMove.empty()) |
2399 | ContextIdsToMove = Edge->getContextIds(); |
2400 | |
2401 | // If we are moving all of Edge's ids, then just move the whole Edge. |
2402 | // Otherwise only move the specified subset, to a new edge if needed. |
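// For example (hypothetical ids): if Edge carries context ids {1, 2, 3} and
// ContextIdsToMove is {1, 2, 3}, the whole edge is redirected to NewCallee
// (or merged into an existing edge to it); if ContextIdsToMove is just {1},
// the edge is split, with {1} moving and {2, 3} staying on Edge.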
2403 | if (Edge->getContextIds().size() == ContextIdsToMove.size()) { |
2404 | // Moving the whole Edge. |
2405 | if (CallerEdgeI) |
2406 | *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); |
2407 | else |
2408 | OldCallee->eraseCallerEdge(Edge.get()); |
2409 | if (ExistingEdgeToNewCallee) { |
2410 | // Since we already have an edge to NewCallee, simply move the ids |
2411 | // onto it, and remove the existing Edge. |
2412 | ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), |
2413 | ContextIdsToMove.end()); |
2414 | ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; |
2415 | assert(Edge->ContextIds == ContextIdsToMove); |
2416 | Edge->ContextIds.clear(); |
2417 | Edge->AllocTypes = (uint8_t)AllocationType::None; |
2418 | Edge->Caller->eraseCalleeEdge(Edge.get()); |
2419 | } else { |
2420 | // Otherwise just reconnect Edge to NewCallee. |
2421 | Edge->Callee = NewCallee; |
2422 | NewCallee->CallerEdges.push_back(Edge); |
2423 | // Don't need to update Edge's context ids since we are simply |
2424 | // reconnecting it. |
2425 | } |
// In either case, need to update the alloc types on NewCallee.
2427 | NewCallee->AllocTypes |= Edge->AllocTypes; |
2428 | } else { |
2429 | // Only moving a subset of Edge's ids. |
2430 | if (CallerEdgeI) |
2431 | ++CallerEdgeI; |
2432 | // Compute the alloc type of the subset of ids being moved. |
auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2434 | if (ExistingEdgeToNewCallee) { |
2435 | // Since we already have an edge to NewCallee, simply move the ids |
2436 | // onto it. |
2437 | ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), |
2438 | ContextIdsToMove.end()); |
2439 | ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType; |
2440 | } else { |
2441 | // Otherwise, create a new edge to NewCallee for the ids being moved. |
2442 | auto NewEdge = std::make_shared<ContextEdge>( |
2443 | NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove); |
2444 | Edge->Caller->CalleeEdges.push_back(NewEdge); |
2445 | NewCallee->CallerEdges.push_back(NewEdge); |
2446 | } |
2447 | // In either case, need to update the alloc types on NewCallee, and remove |
2448 | // those ids and update the alloc type on the original Edge. |
2449 | NewCallee->AllocTypes |= CallerEdgeAllocType; |
2450 | set_subtract(Edge->ContextIds, ContextIdsToMove); |
Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2452 | } |
2453 | // Now perform some updates that are common to all cases: the NewCallee gets |
2454 | // the moved ids added, and we need to remove those ids from OldCallee and |
2455 | // update its alloc type (NewCallee alloc type updates handled above). |
2456 | NewCallee->ContextIds.insert(ContextIdsToMove.begin(), |
2457 | ContextIdsToMove.end()); |
2458 | set_subtract(OldCallee->ContextIds, ContextIdsToMove); |
OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2460 | // OldCallee alloc type should be None iff its context id set is now empty. |
2461 | assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == |
2462 | OldCallee->ContextIds.empty()); |
2463 | // Now walk the old callee node's callee edges and move Edge's context ids |
2464 | // over to the corresponding edge into the clone (which is created here if |
2465 | // this is a newly created clone). |
2466 | for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { |
2467 | // The context ids moving to the new callee are the subset of this edge's |
2468 | // context ids and the context ids on the caller edge being moved. |
2469 | DenseSet<uint32_t> EdgeContextIdsToMove = |
2470 | set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove); |
2471 | set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); |
2472 | OldCalleeEdge->AllocTypes = |
computeAllocType(OldCalleeEdge->getContextIds());
2474 | if (!NewClone) { |
2475 | // Update context ids / alloc type on corresponding edge to NewCallee. |
2476 | // There is a chance this may not exist if we are reusing an existing |
2477 | // clone, specifically during function assignment, where we would have |
2478 | // removed none type edges after creating the clone. If we can't find |
2479 | // a corresponding edge there, fall through to the cloning below. |
2480 | if (auto *NewCalleeEdge = |
2481 | NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) { |
2482 | NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(), |
2483 | EdgeContextIdsToMove.end()); |
NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2485 | continue; |
2486 | } |
2487 | } |
2488 | auto NewEdge = std::make_shared<ContextEdge>( |
2489 | OldCalleeEdge->Callee, NewCallee, |
        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2491 | NewCallee->CalleeEdges.push_back(NewEdge); |
2492 | NewEdge->Callee->CallerEdges.push_back(NewEdge); |
2493 | } |
2494 | if (VerifyCCG) { |
2495 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); |
2496 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); |
2497 | for (const auto &OldCalleeEdge : OldCallee->CalleeEdges) |
2498 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee, |
2499 | /*CheckEdges=*/false); |
2500 | for (const auto &NewCalleeEdge : NewCallee->CalleeEdges) |
2501 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee, |
2502 | /*CheckEdges=*/false); |
2503 | } |
2504 | } |
2505 | |
2506 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2507 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
2508 | recursivelyRemoveNoneTypeCalleeEdges( |
2509 | ContextNode *Node, DenseSet<const ContextNode *> &Visited) { |
2510 | auto Inserted = Visited.insert(Node); |
2511 | if (!Inserted.second) |
2512 | return; |
2513 | |
2514 | removeNoneTypeCalleeEdges(Node); |
2515 | |
2516 | for (auto *Clone : Node->Clones) |
    recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2518 | |
2519 | // The recursive call may remove some of this Node's caller edges. |
2520 | // Iterate over a copy and skip any that were removed. |
2521 | auto CallerEdges = Node->CallerEdges; |
2522 | for (auto &Edge : CallerEdges) { |
2523 | // Skip any that have been removed by an earlier recursive call. |
2524 | if (Edge->Callee == nullptr && Edge->Caller == nullptr) { |
2525 | assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(), |
2526 | Edge)); |
2527 | continue; |
2528 | } |
    recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2530 | } |
2531 | } |
2532 | |
2533 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2534 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { |
2535 | DenseSet<const ContextNode *> Visited; |
2536 | for (auto &Entry : AllocationCallToContextNodeMap) { |
2537 | Visited.clear(); |
2538 | identifyClones(Entry.second, Visited, Entry.second->ContextIds); |
2539 | } |
2540 | Visited.clear(); |
2541 | for (auto &Entry : AllocationCallToContextNodeMap) |
    recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2543 | if (VerifyCCG) |
2544 | check(); |
2545 | } |
2546 | |
// Helper function to check whether an AllocType is Cold, NotCold, or both.
2548 | bool checkColdOrNotCold(uint8_t AllocType) { |
2549 | return (AllocType == (uint8_t)AllocationType::Cold) || |
2550 | (AllocType == (uint8_t)AllocationType::NotCold) || |
2551 | (AllocType == |
2552 | ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold)); |
2553 | } |
2554 | |
2555 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2556 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( |
2557 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
2558 | const DenseSet<uint32_t> &AllocContextIds) { |
2559 | if (VerifyNodes) |
2560 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2561 | assert(!Node->CloneOf); |
2562 | |
  // If Node has a null call, then either it wasn't found in the module (regular
2564 | // LTO) or summary index (ThinLTO), or there were other conditions blocking |
2565 | // cloning (e.g. recursion, calls multiple targets, etc). |
2566 | // Do this here so that we don't try to recursively clone callers below, which |
2567 | // isn't useful at least for this node. |
2568 | if (!Node->hasCall()) |
2569 | return; |
2570 | |
2571 | #ifndef NDEBUG |
2572 | auto Insert = |
2573 | #endif |
2574 | Visited.insert(Node); |
2575 | // We should not have visited this node yet. |
2576 | assert(Insert.second); |
2577 | // The recursive call to identifyClones may delete the current edge from the |
2578 | // CallerEdges vector. Make a copy and iterate on that, simpler than passing |
2579 | // in an iterator and having recursive call erase from it. Other edges may |
2580 | // also get removed during the recursion, which will have null Callee and |
2581 | // Caller pointers (and are deleted later), so we skip those below. |
2582 | { |
2583 | auto CallerEdges = Node->CallerEdges; |
2584 | for (auto &Edge : CallerEdges) { |
2585 | // Skip any that have been removed by an earlier recursive call. |
2586 | if (Edge->Callee == nullptr && Edge->Caller == nullptr) { |
2587 | assert(!llvm::count(Node->CallerEdges, Edge)); |
2588 | continue; |
2589 | } |
2590 | // Ignore any caller we previously visited via another edge. |
2591 | if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) { |
2592 | identifyClones(Edge->Caller, Visited, AllocContextIds); |
2593 | } |
2594 | } |
2595 | } |
2596 | |
  // Check if we reached an unambiguous call or have only a single caller.
2598 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
2599 | return; |
2600 | |
2601 | // We need to clone. |
2602 | |
2603 | // Try to keep the original version as alloc type NotCold. This will make |
2604 | // cases with indirect calls or any other situation with an unknown call to |
2605 | // the original function get the default behavior. We do this by sorting the |
2606 | // CallerEdges of the Node we will clone by alloc type. |
2607 | // |
2608 | // Give NotCold edge the lowest sort priority so those edges are at the end of |
2609 | // the caller edges vector, and stay on the original version (since the below |
2610 | // code clones greedily until it finds all remaining edges have the same type |
2611 | // and leaves the remaining ones on the original Node). |
2612 | // |
2613 | // We shouldn't actually have any None type edges, so the sorting priority for |
2614 | // that is arbitrary, and we assert in that case below. |
2615 | const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4, |
2616 | /*Cold*/ 1, |
2617 | /*NotColdCold*/ 2}; |
2618 | std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(), |
2619 | [&](const std::shared_ptr<ContextEdge> &A, |
2620 | const std::shared_ptr<ContextEdge> &B) { |
2621 | // Nodes with non-empty context ids should be sorted before |
2622 | // those with empty context ids. |
2623 | if (A->ContextIds.empty()) |
2624 | // Either B ContextIds are non-empty (in which case we |
2625 | // should return false because B < A), or B ContextIds |
2626 | // are empty, in which case they are equal, and we should |
2627 | // maintain the original relative ordering. |
2628 | return false; |
2629 | if (B->ContextIds.empty()) |
2630 | return true; |
2631 | |
2632 | if (A->AllocTypes == B->AllocTypes) |
2633 | // Use the first context id for each edge as a |
2634 | // tie-breaker. |
2635 | return *A->ContextIds.begin() < *B->ContextIds.begin(); |
2636 | return AllocTypeCloningPriority[A->AllocTypes] < |
2637 | AllocTypeCloningPriority[B->AllocTypes]; |
2638 | }); |
2639 | |
2640 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2641 | |
2642 | // Iterate until we find no more opportunities for disambiguating the alloc |
2643 | // types via cloning. In most cases this loop will terminate once the Node |
2644 | // has a single allocation type, in which case no more cloning is needed. |
2645 | // We need to be able to remove Edge from CallerEdges, so need to adjust |
2646 | // iterator inside the loop. |
2647 | for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) { |
2648 | auto CallerEdge = *EI; |
2649 | |
2650 | // See if cloning the prior caller edge left this node with a single alloc |
2651 | // type or a single caller. In that case no more cloning of Node is needed. |
2652 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
2653 | break; |
2654 | |
2655 | // Only need to process the ids along this edge pertaining to the given |
2656 | // allocation. |
2657 | auto CallerEdgeContextsForAlloc = |
2658 | set_intersection(CallerEdge->getContextIds(), AllocContextIds); |
2659 | if (CallerEdgeContextsForAlloc.empty()) { |
2660 | ++EI; |
2661 | continue; |
2662 | } |
    auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2664 | |
2665 | // Compute the node callee edge alloc types corresponding to the context ids |
2666 | // for this caller edge. |
2667 | std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; |
    CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2669 | for (auto &CalleeEdge : Node->CalleeEdges) |
      CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
          CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2672 | |
2673 | // Don't clone if doing so will not disambiguate any alloc types amongst |
2674 | // caller edges (including the callee edges that would be cloned). |
2675 | // Otherwise we will simply move all edges to the clone. |
2676 | // |
2677 | // First check if by cloning we will disambiguate the caller allocation |
2678 | // type from node's allocation type. Query allocTypeToUse so that we don't |
2679 | // bother cloning to distinguish NotCold+Cold from NotCold. Note that |
2680 | // neither of these should be None type. |
2681 | // |
2682 | // Then check if by cloning node at least one of the callee edges will be |
2683 | // disambiguated by splitting out different context ids. |
2684 | assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); |
2685 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2686 | if (allocTypeToUse(CallerAllocTypeForAlloc) == |
2687 | allocTypeToUse(Node->AllocTypes) && |
2688 | allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
2689 | CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { |
2690 | ++EI; |
2691 | continue; |
2692 | } |
2693 | |
2694 | // First see if we can use an existing clone. Check each clone and its |
2695 | // callee edges for matching alloc types. |
2696 | ContextNode *Clone = nullptr; |
2697 | for (auto *CurClone : Node->Clones) { |
2698 | if (allocTypeToUse(CurClone->AllocTypes) != |
2699 | allocTypeToUse(CallerAllocTypeForAlloc)) |
2700 | continue; |
2701 | |
2702 | if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
2703 | CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges)) |
2704 | continue; |
2705 | Clone = CurClone; |
2706 | break; |
2707 | } |
2708 | |
2709 | // The edge iterator is adjusted when we move the CallerEdge to the clone. |
2710 | if (Clone) |
      moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
                                    CallerEdgeContextsForAlloc);
2713 | else |
      Clone =
          moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2716 | |
2717 | assert(EI == Node->CallerEdges.end() || |
2718 | Node->AllocTypes != (uint8_t)AllocationType::None); |
2719 | // Sanity check that no alloc types on clone or its edges are None. |
2720 | assert(Clone->AllocTypes != (uint8_t)AllocationType::None); |
2721 | } |
2722 | |
2723 | // We should still have some context ids on the original Node. |
2724 | assert(!Node->ContextIds.empty()); |
2725 | |
2726 | // Sanity check that no alloc types on node or edges are None. |
2727 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2728 | |
2729 | if (VerifyNodes) |
2730 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2731 | } |
2732 | |
2733 | void ModuleCallsiteContextGraph::updateAllocationCall( |
2734 | CallInfo &Call, AllocationType AllocType) { |
  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                "memprof", AllocTypeString);
  cast<CallBase>(Call.call())->addFnAttr(A);
2739 | OREGetter(Call.call()->getFunction()) |
2740 | .emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute" , Call.call()) |
2741 | << ore::NV("AllocationCall" , Call.call()) << " in clone " |
2742 | << ore::NV("Caller" , Call.call()->getFunction()) |
2743 | << " marked with memprof allocation attribute " |
2744 | << ore::NV("Attribute" , AllocTypeString)); |
2745 | } |
2746 | |
2747 | void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, |
2748 | AllocationType AllocType) { |
2749 | auto *AI = Call.call().dyn_cast<AllocInfo *>(); |
2750 | assert(AI); |
2751 | assert(AI->Versions.size() > Call.cloneNo()); |
2752 | AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; |
2753 | } |
2754 | |
2755 | void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
2756 | FuncInfo CalleeFunc) { |
2757 | if (CalleeFunc.cloneNo() > 0) |
    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2759 | OREGetter(CallerCall.call()->getFunction()) |
2760 | .emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CallerCall.call()) |
2761 | << ore::NV("Call" , CallerCall.call()) << " in clone " |
2762 | << ore::NV("Caller" , CallerCall.call()->getFunction()) |
2763 | << " assigned to call function clone " |
2764 | << ore::NV("Callee" , CalleeFunc.func())); |
2765 | } |
2766 | |
2767 | void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
2768 | FuncInfo CalleeFunc) { |
2769 | auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>(); |
2770 | assert(CI && |
2771 | "Caller cannot be an allocation which should not have profiled calls" ); |
2772 | assert(CI->Clones.size() > CallerCall.cloneNo()); |
2773 | CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); |
2774 | } |
2775 | |
2776 | CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
2777 | Instruction *>::FuncInfo |
2778 | ModuleCallsiteContextGraph::cloneFunctionForCallsite( |
2779 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
2780 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
  // Use existing LLVM facilities for cloning and obtaining Call in the clone.
  ValueToValueMapTy VMap;
  auto *NewFunc = CloneFunction(Func.func(), VMap);
  std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2785 | assert(!Func.func()->getParent()->getFunction(Name)); |
2786 | NewFunc->setName(Name); |
2787 | for (auto &Inst : CallsWithMetadataInFunc) { |
2788 | // This map always has the initial version in it. |
2789 | assert(Inst.cloneNo() == 0); |
    CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2791 | } |
2792 | OREGetter(Func.func()) |
2793 | .emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofClone" , Func.func()) |
2794 | << "created clone " << ore::NV("NewFunction" , NewFunc)); |
2795 | return {NewFunc, CloneNo}; |
2796 | } |
2797 | |
2798 | CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
2799 | IndexCall>::FuncInfo |
2800 | IndexCallsiteContextGraph::cloneFunctionForCallsite( |
2801 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
2802 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
2803 | // Check how many clones we have of Call (and therefore function). |
2804 | // The next clone number is the current size of versions array. |
2805 | // Confirm this matches the CloneNo provided by the caller, which is based on |
2806 | // the number of function clones we have. |
2807 | assert(CloneNo == |
2808 | (Call.call().is<AllocInfo *>() |
2809 | ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() |
2810 | : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); |
2811 | // Walk all the instructions in this function. Create a new version for |
2812 | // each (by adding an entry to the Versions/Clones summary array), and copy |
2813 | // over the version being called for the function clone being cloned here. |
2814 | // Additionally, add an entry to the CallMap for the new function clone, |
2815 | // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) |
2816 | // to the new call clone. |
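  //
  // For example (illustrative): when function clone 1 is created, an
  // allocation summary whose Versions array is currently {V0} gets a zero
  // (None) placeholder appended, giving {V0, 0}; updateAllocationCall later
  // overwrites that entry with the allocation type chosen for the clone.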
2817 | for (auto &Inst : CallsWithMetadataInFunc) { |
2818 | // This map always has the initial version in it. |
2819 | assert(Inst.cloneNo() == 0); |
2820 | if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) { |
2821 | assert(AI->Versions.size() == CloneNo); |
2822 | // We assign the allocation type later (in updateAllocationCall), just add |
2823 | // an entry for it here. |
      AI->Versions.push_back(0);
2825 | } else { |
2826 | auto *CI = Inst.call().dyn_cast<CallsiteInfo *>(); |
2827 | assert(CI && CI->Clones.size() == CloneNo); |
2828 | // We assign the clone number later (in updateCall), just add an entry for |
2829 | // it here. |
      CI->Clones.push_back(0);
2831 | } |
2832 | CallMap[Inst] = {Inst.call(), CloneNo}; |
2833 | } |
2834 | return {Func.func(), CloneNo}; |
2835 | } |
2836 | |
2837 | // This method assigns cloned callsites to functions, cloning the functions as |
2838 | // needed. The assignment is greedy and proceeds roughly as follows: |
2839 | // |
2840 | // For each function Func: |
2841 | // For each call with graph Node having clones: |
2842 | // Initialize ClonesWorklist to Node and its clones |
2843 | // Initialize NodeCloneCount to 0 |
2844 | // While ClonesWorklist is not empty: |
2845 | // Clone = pop front ClonesWorklist |
2846 | // NodeCloneCount++ |
2847 | // If Func has been cloned less than NodeCloneCount times: |
2848 | // If NodeCloneCount is 1: |
2849 | // Assign Clone to original Func |
2850 | // Continue |
2851 | // Create a new function clone |
2852 | // If other callers not assigned to call a function clone yet: |
2853 | // Assign them to call new function clone |
2854 | // Continue |
2855 | // Assign any other caller calling the cloned version to new clone |
2856 | // |
2857 | // For each caller of Clone: |
2858 | // If caller is assigned to call a specific function clone: |
2859 | // If we cannot assign Clone to that function clone: |
2860 | // Create new callsite Clone NewClone |
2861 | // Add NewClone to ClonesWorklist |
2862 | // Continue |
2863 | // Assign Clone to existing caller's called function clone |
2864 | // Else: |
2865 | // If Clone not already assigned to a function clone: |
2866 | // Assign to first function clone without assignment |
2867 | // Assign caller to selected function clone |
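//
// For example (an illustrative walk-through, not from a real profile):
// suppose function A contains callsite node C with one clone C.1, and the
// callers reaching C.1 have not yet been assigned to any clone of A.
// Processing C assigns it to the original A; processing C.1 finds only one
// existing copy of A, so a new function clone of A is created, C.1 is
// assigned to it, and C.1's callers are recorded as calling the new clone.
// If some caller of C.1 had instead already been pinned to a clone of A that
// conflicts with this assignment, C.1 would be cloned again and the new
// callsite clone pushed onto the worklist for a later iteration.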
2868 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2869 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { |
2870 | bool Changed = false; |
2871 | |
2872 | // Keep track of the assignment of nodes (callsites) to function clones they |
2873 | // call. |
2874 | DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; |
2875 | |
2876 | // Update caller node to call function version CalleeFunc, by recording the |
2877 | // assignment in CallsiteToCalleeFuncCloneMap. |
2878 | auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, |
2879 | const FuncInfo &CalleeFunc) { |
2880 | assert(Caller->hasCall()); |
2881 | CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; |
2882 | }; |
2883 | |
  // Walk all functions for which we saw calls with memprof metadata, and
  // handle cloning for each of their calls.
2886 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
2887 | FuncInfo OrigFunc(Func); |
2888 | // Map from each clone of OrigFunc to a map of remappings of each call of |
2889 | // interest (from original uncloned call to the corresponding cloned call in |
2890 | // that function clone). |
2891 | std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; |
2892 | for (auto &Call : CallsWithMetadata) { |
      ContextNode *Node = getNodeForInst(Call);
2894 | // Skip call if we do not have a node for it (all uses of its stack ids |
2895 | // were either on inlined chains or pruned from the MIBs), or if we did |
2896 | // not create any clones for it. |
2897 | if (!Node || Node->Clones.empty()) |
2898 | continue; |
2899 | assert(Node->hasCall() && |
2900 | "Not having a call should have prevented cloning" ); |
2901 | |
2902 | // Track the assignment of function clones to clones of the current |
2903 | // callsite Node being handled. |
2904 | std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; |
2905 | |
2906 | // Assign callsite version CallsiteClone to function version FuncClone, |
2907 | // and also assign (possibly cloned) Call to CallsiteClone. |
2908 | auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, |
2909 | CallInfo &Call, |
2910 | ContextNode *CallsiteClone, |
2911 | bool IsAlloc) { |
2912 | // Record the clone of callsite node assigned to this function clone. |
2913 | FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; |
2914 | |
2915 | assert(FuncClonesToCallMap.count(FuncClone)); |
2916 | std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; |
2917 | CallInfo CallClone(Call); |
2918 | if (CallMap.count(Call)) |
2919 | CallClone = CallMap[Call]; |
2920 | CallsiteClone->setCall(CallClone); |
2921 | }; |
2922 | |
2923 | // Keep track of the clones of callsite Node that need to be assigned to |
2924 | // function clones. This list may be expanded in the loop body below if we |
2925 | // find additional cloning is required. |
2926 | std::deque<ContextNode *> ClonesWorklist; |
2927 | // Ignore original Node if we moved all of its contexts to clones. |
2928 | if (!Node->ContextIds.empty()) |
2929 | ClonesWorklist.push_back(Node); |
2930 | ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), |
2931 | Node->Clones.end()); |
2932 | |
2933 | // Now walk through all of the clones of this callsite Node that we need, |
2934 | // and determine the assignment to a corresponding clone of the current |
2935 | // function (creating new function clones as needed). |
2936 | unsigned NodeCloneCount = 0; |
2937 | while (!ClonesWorklist.empty()) { |
2938 | ContextNode *Clone = ClonesWorklist.front(); |
2939 | ClonesWorklist.pop_front(); |
2940 | NodeCloneCount++; |
2941 | if (VerifyNodes) |
2942 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
2943 | |
2944 | // Need to create a new function clone if we have more callsite clones |
2945 | // than existing function clones, which would have been assigned to an |
2946 | // earlier clone in the list (we assign callsite clones to function |
2947 | // clones greedily). |
2948 | if (FuncClonesToCallMap.size() < NodeCloneCount) { |
2949 | // If this is the first callsite copy, assign to original function. |
2950 | if (NodeCloneCount == 1) { |
2951 | // Since FuncClonesToCallMap is empty in this case, no clones have |
2952 | // been created for this function yet, and no callers should have |
2953 | // been assigned a function clone for this callee node yet. |
2954 | assert(llvm::none_of( |
2955 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
2956 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
2957 | })); |
2958 | // Initialize with empty call map, assign Clone to original function |
2959 | // and its callers, and skip to the next clone. |
2960 | FuncClonesToCallMap[OrigFunc] = {}; |
2961 | AssignCallsiteCloneToFuncClone( |
2962 | OrigFunc, Call, Clone, |
2963 | AllocationCallToContextNodeMap.count(Call)); |
2964 | for (auto &CE : Clone->CallerEdges) { |
2965 | // Ignore any caller that does not have a recorded callsite Call. |
2966 | if (!CE->Caller->hasCall()) |
2967 | continue; |
2968 | RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); |
2969 | } |
2970 | continue; |
2971 | } |
2972 | |
2973 | // First locate which copy of OrigFunc to clone again. If a caller |
2974 | // of this callsite clone was already assigned to call a particular |
2975 | // function clone, we need to redirect all of those callers to the |
2976 | // new function clone, and update their other callees within this |
2977 | // function. |
2978 | FuncInfo PreviousAssignedFuncClone; |
2979 | auto EI = llvm::find_if( |
2980 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
2981 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
2982 | }); |
2983 | bool CallerAssignedToCloneOfFunc = false; |
2984 | if (EI != Clone->CallerEdges.end()) { |
2985 | const std::shared_ptr<ContextEdge> &Edge = *EI; |
2986 | PreviousAssignedFuncClone = |
2987 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
2988 | CallerAssignedToCloneOfFunc = true; |
2989 | } |
2990 | |
2991 | // Clone function and save it along with the CallInfo map created |
2992 | // during cloning in the FuncClonesToCallMap. |
2993 | std::map<CallInfo, CallInfo> NewCallMap; |
2994 | unsigned CloneNo = FuncClonesToCallMap.size(); |
2995 | assert(CloneNo > 0 && "Clone 0 is the original function, which " |
2996 | "should already exist in the map" ); |
2997 | FuncInfo NewFuncClone = cloneFunctionForCallsite( |
            OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2999 | FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); |
3000 | FunctionClonesAnalysis++; |
3001 | Changed = true; |
3002 | |
3003 | // If no caller callsites were already assigned to a clone of this |
3004 | // function, we can simply assign this clone to the new func clone |
3005 | // and update all callers to it, then skip to the next clone. |
3006 | if (!CallerAssignedToCloneOfFunc) { |
3007 | AssignCallsiteCloneToFuncClone( |
3008 | NewFuncClone, Call, Clone, |
3009 | AllocationCallToContextNodeMap.count(Call)); |
3010 | for (auto &CE : Clone->CallerEdges) { |
3011 | // Ignore any caller that does not have a recorded callsite Call. |
3012 | if (!CE->Caller->hasCall()) |
3013 | continue; |
3014 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
3015 | } |
3016 | continue; |
3017 | } |
3018 | |
3019 | // We may need to do additional node cloning in this case. |
3020 | // Reset the CallsiteToCalleeFuncCloneMap entry for any callers |
3021 | // that were previously assigned to call PreviousAssignedFuncClone, |
3022 | // to record that they now call NewFuncClone. |
3023 | for (auto CE : Clone->CallerEdges) { |
3024 | // Skip any that have been removed on an earlier iteration. |
3025 | if (!CE) |
3026 | continue; |
3027 | // Ignore any caller that does not have a recorded callsite Call. |
3028 | if (!CE->Caller->hasCall()) |
3029 | continue; |
3030 | |
3031 | if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || |
3032 | // We subsequently fall through to later handling that |
3033 | // will perform any additional cloning required for |
3034 | // callers that were calling other function clones. |
3035 | CallsiteToCalleeFuncCloneMap[CE->Caller] != |
3036 | PreviousAssignedFuncClone) |
3037 | continue; |
3038 | |
3039 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
3040 | |
3041 | // If we are cloning a function that was already assigned to some |
3042 | // callers, then essentially we are creating new callsite clones |
3043 | // of the other callsites in that function that are reached by those |
3044 | // callers. Clone the other callees of the current callsite's caller |
3045 | // that were already assigned to PreviousAssignedFuncClone |
3046 | // accordingly. This is important since we subsequently update the |
3047 | // calls from the nodes in the graph and their assignments to callee |
3048 | // functions recorded in CallsiteToCalleeFuncCloneMap. |
3049 | for (auto CalleeEdge : CE->Caller->CalleeEdges) { |
3050 | // Skip any that have been removed on an earlier iteration when |
3051 | // cleaning up newly None type callee edges. |
3052 | if (!CalleeEdge) |
3053 | continue; |
3054 | ContextNode *Callee = CalleeEdge->Callee; |
3055 | // Skip the current callsite, we are looking for other |
3056 | // callsites Caller calls, as well as any that does not have a |
3057 | // recorded callsite Call. |
3058 | if (Callee == Clone || !Callee->hasCall()) |
3059 | continue; |
            ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
            removeNoneTypeCalleeEdges(NewClone);
3062 | // Moving the edge may have resulted in some none type |
3063 | // callee edges on the original Callee. |
            removeNoneTypeCalleeEdges(Callee);
3065 | assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
3066 | // If the Callee node was already assigned to call a specific |
3067 | // function version, make sure its new clone is assigned to call |
3068 | // that same function clone. |
3069 | if (CallsiteToCalleeFuncCloneMap.count(Callee)) |
3070 | RecordCalleeFuncOfCallsite( |
3071 | NewClone, CallsiteToCalleeFuncCloneMap[Callee]); |
3072 | // Update NewClone with the new Call clone of this callsite's Call |
3073 | // created for the new function clone created earlier. |
3074 | // Recall that we have already ensured when building the graph |
3075 | // that each caller can only call callsites within the same |
3076 | // function, so we are guaranteed that Callee Call is in the |
3077 | // current OrigFunc. |
3078 | // CallMap is set up as indexed by original Call at clone 0. |
3079 | CallInfo OrigCall(Callee->getOrigNode()->Call); |
3080 | OrigCall.setCloneNo(0); |
3081 | std::map<CallInfo, CallInfo> &CallMap = |
3082 | FuncClonesToCallMap[NewFuncClone]; |
3083 | assert(CallMap.count(OrigCall)); |
3084 | CallInfo NewCall(CallMap[OrigCall]); |
3085 | assert(NewCall); |
3086 | NewClone->setCall(NewCall); |
3087 | } |
3088 | } |
3089 | // Fall through to handling below to perform the recording of the |
3090 | // function for this callsite clone. This enables handling of cases |
3091 | // where the callers were assigned to different clones of a function. |
3092 | } |
3093 | |
3094 | // See if we can use existing function clone. Walk through |
3095 | // all caller edges to see if any have already been assigned to |
3096 | // a clone of this callsite's function. If we can use it, do so. If not, |
3097 | // because that function clone is already assigned to a different clone |
3098 | // of this callsite, then we need to clone again. |
3099 | // Basically, this checking is needed to handle the case where different |
3100 | // caller functions/callsites may need versions of this function |
3101 | // containing different mixes of callsite clones across the different |
3102 | // callsites within the function. If that happens, we need to create |
3103 | // additional function clones to handle the various combinations. |
3104 | // |
3105 | // Keep track of any new clones of this callsite created by the |
3106 | // following loop, as well as any existing clone that we decided to |
3107 | // assign this clone to. |
3108 | std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; |
3109 | FuncInfo FuncCloneAssignedToCurCallsiteClone; |
3110 | // We need to be able to remove Edge from CallerEdges, so need to adjust |
3111 | // iterator in the loop. |
3112 | for (auto EI = Clone->CallerEdges.begin(); |
3113 | EI != Clone->CallerEdges.end();) { |
3114 | auto Edge = *EI; |
3115 | // Ignore any caller that does not have a recorded callsite Call. |
3116 | if (!Edge->Caller->hasCall()) { |
3117 | EI++; |
3118 | continue; |
3119 | } |
3120 | // If this caller already assigned to call a version of OrigFunc, need |
3121 | // to ensure we can assign this callsite clone to that function clone. |
3122 | if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { |
3123 | FuncInfo FuncCloneCalledByCaller = |
3124 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
3125 | // First we need to confirm that this function clone is available |
3126 | // for use by this callsite node clone. |
3127 | // |
3128 | // While FuncCloneToCurNodeCloneMap is built only for this Node and |
3129 | // its callsite clones, one of those callsite clones X could have |
3130 | // been assigned to the same function clone called by Edge's caller |
3131 | // - if Edge's caller calls another callsite within Node's original |
3132 | // function, and that callsite has another caller reaching clone X. |
3133 | // We need to clone Node again in this case. |
3134 | if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && |
3135 | FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != |
3136 | Clone) || |
3137 | // Detect when we have multiple callers of this callsite that |
3138 | // have already been assigned to specific, and different, clones |
3139 | // of OrigFunc (due to other unrelated callsites in Func they |
3140 | // reach via call contexts). Is this Clone of callsite Node |
3141 | // assigned to a different clone of OrigFunc? If so, clone Node |
3142 | // again. |
3143 | (FuncCloneAssignedToCurCallsiteClone && |
3144 | FuncCloneAssignedToCurCallsiteClone != |
3145 | FuncCloneCalledByCaller)) { |
3146 | // We need to use a different newly created callsite clone, in |
3147 | // order to assign it to another new function clone on a |
3148 | // subsequent iteration over the Clones array (adjusted below). |
3149 | // Note we specifically do not reset the |
3150 | // CallsiteToCalleeFuncCloneMap entry for this caller, so that |
3151 | // when this new clone is processed later we know which version of |
3152 | // the function to copy (so that other callsite clones we have |
3153 | // assigned to that function clone are properly cloned over). See |
3154 | // comments in the function cloning handling earlier. |
3155 | |
3156 | // Check if we already have cloned this callsite again while |
3157 | // walking through caller edges, for a caller calling the same |
3158 | // function clone. If so, we can move this edge to that new clone |
3159 | // rather than creating yet another new clone. |
3160 | if (FuncCloneToNewCallsiteCloneMap.count( |
3161 | FuncCloneCalledByCaller)) { |
3162 | ContextNode *NewClone = |
3163 | FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; |
              moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3165 | // Cleanup any none type edges cloned over. |
              removeNoneTypeCalleeEdges(NewClone);
3167 | } else { |
3168 | // Create a new callsite clone. |
              ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
              removeNoneTypeCalleeEdges(NewClone);
3171 | FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = |
3172 | NewClone; |
3173 | // Add to list of clones and process later. |
3174 | ClonesWorklist.push_back(NewClone); |
3175 | assert(EI == Clone->CallerEdges.end() || |
3176 | Clone->AllocTypes != (uint8_t)AllocationType::None); |
3177 | assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
3178 | } |
3179 | // Moving the caller edge may have resulted in some none type |
3180 | // callee edges. |
            removeNoneTypeCalleeEdges(Clone);
3182 | // We will handle the newly created callsite clone in a subsequent |
3183 | // iteration over this Node's Clones. Continue here since we |
3184 | // already adjusted iterator EI while moving the edge. |
3185 | continue; |
3186 | } |
3187 | |
3188 | // Otherwise, we can use the function clone already assigned to this |
3189 | // caller. |
3190 | if (!FuncCloneAssignedToCurCallsiteClone) { |
3191 | FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; |
3192 | // Assign Clone to FuncCloneCalledByCaller |
3193 | AssignCallsiteCloneToFuncClone( |
3194 | FuncCloneCalledByCaller, Call, Clone, |
3195 | AllocationCallToContextNodeMap.count(Call)); |
3196 | } else |
3197 | // Don't need to do anything - callsite is already calling this |
3198 | // function clone. |
3199 | assert(FuncCloneAssignedToCurCallsiteClone == |
3200 | FuncCloneCalledByCaller); |
3201 | |
3202 | } else { |
3203 | // We have not already assigned this caller to a version of |
3204 | // OrigFunc. Do the assignment now. |
3205 | |
3206 | // First check if we have already assigned this callsite clone to a |
3207 | // clone of OrigFunc for another caller during this iteration over |
3208 | // its caller edges. |
3209 | if (!FuncCloneAssignedToCurCallsiteClone) { |
3210 | // Find first function in FuncClonesToCallMap without an assigned |
3211 | // clone of this callsite Node. We should always have one |
3212 | // available at this point due to the earlier cloning when the |
3213 | // FuncClonesToCallMap size was smaller than the clone number. |
3214 | for (auto &CF : FuncClonesToCallMap) { |
3215 | if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { |
3216 | FuncCloneAssignedToCurCallsiteClone = CF.first; |
3217 | break; |
3218 | } |
3219 | } |
3220 | assert(FuncCloneAssignedToCurCallsiteClone); |
3221 | // Assign Clone to FuncCloneAssignedToCurCallsiteClone |
3222 | AssignCallsiteCloneToFuncClone( |
3223 | FuncCloneAssignedToCurCallsiteClone, Call, Clone, |
3224 | AllocationCallToContextNodeMap.count(Call)); |
3225 | } else |
3226 | assert(FuncCloneToCurNodeCloneMap |
3227 | [FuncCloneAssignedToCurCallsiteClone] == Clone); |
3228 | // Update callers to record function version called. |
3229 | RecordCalleeFuncOfCallsite(Edge->Caller, |
3230 | FuncCloneAssignedToCurCallsiteClone); |
3231 | } |
3232 | |
3233 | EI++; |
3234 | } |
3235 | } |
3236 | if (VerifyCCG) { |
3237 | checkNode<DerivedCCG, FuncTy, CallTy>(Node); |
3238 | for (const auto &PE : Node->CalleeEdges) |
3239 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
3240 | for (const auto &CE : Node->CallerEdges) |
3241 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
3242 | for (auto *Clone : Node->Clones) { |
3243 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
3244 | for (const auto &PE : Clone->CalleeEdges) |
3245 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
3246 | for (const auto &CE : Clone->CallerEdges) |
3247 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
3248 | } |
3249 | } |
3250 | } |
3251 | } |
3252 | |
3253 | auto UpdateCalls = [&](ContextNode *Node, |
3254 | DenseSet<const ContextNode *> &Visited, |
3255 | auto &&UpdateCalls) { |
3256 | auto Inserted = Visited.insert(Node); |
3257 | if (!Inserted.second) |
3258 | return; |
3259 | |
3260 | for (auto *Clone : Node->Clones) |
3261 | UpdateCalls(Clone, Visited, UpdateCalls); |
3262 | |
3263 | for (auto &Edge : Node->CallerEdges) |
3264 | UpdateCalls(Edge->Caller, Visited, UpdateCalls); |
3265 | |
3266 | // Skip if either no call to update, or if we ended up with no context ids |
3267 | // (we moved all edges onto other clones). |
3268 | if (!Node->hasCall() || Node->ContextIds.empty()) |
3269 | return; |
3270 | |
3271 | if (Node->IsAllocation) { |
      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3273 | return; |
3274 | } |
3275 | |
3276 | if (!CallsiteToCalleeFuncCloneMap.count(Node)) |
3277 | return; |
3278 | |
3279 | auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; |
    updateCall(Node->Call, CalleeFunc);
3281 | }; |
3282 | |
3283 | // Performs DFS traversal starting from allocation nodes to update calls to |
3284 | // reflect cloning decisions recorded earlier. For regular LTO this will |
3285 | // update the actual calls in the IR to call the appropriate function clone |
3286 | // (and add attributes to allocation calls), whereas for ThinLTO the decisions |
3287 | // are recorded in the summary entries. |
3288 | DenseSet<const ContextNode *> Visited; |
3289 | for (auto &Entry : AllocationCallToContextNodeMap) |
3290 | UpdateCalls(Entry.second, Visited, UpdateCalls); |
3291 | |
3292 | return Changed; |
3293 | } |
3294 | |
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3296 | Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, |
3297 | std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
3298 | &FuncToAliasMap) { |
3299 | // The first "clone" is the original copy, we should only call this if we |
3300 | // needed to create new clones. |
3301 | assert(NumClones > 1); |
3302 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
  VMaps.reserve(NumClones - 1);
3304 | FunctionsClonedThinBackend++; |
3305 | for (unsigned I = 1; I < NumClones; I++) { |
    VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
    auto *NewF = CloneFunction(&F, *VMaps.back());
3308 | FunctionClonesThinBackend++; |
3309 | // Strip memprof and callsite metadata from clone as they are no longer |
3310 | // needed. |
3311 | for (auto &BB : *NewF) { |
3312 | for (auto &Inst : BB) { |
        Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
        Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3315 | } |
3316 | } |
    std::string Name = getMemProfFuncName(F.getName(), I);
3318 | auto *PrevF = M.getFunction(Name); |
3319 | if (PrevF) { |
3320 | // We might have created this when adjusting callsite in another |
3321 | // function. It should be a declaration. |
3322 | assert(PrevF->isDeclaration()); |
      NewF->takeName(PrevF);
      PrevF->replaceAllUsesWith(NewF);
3325 | PrevF->eraseFromParent(); |
3326 | } else |
3327 | NewF->setName(Name); |
3328 | ORE.emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofClone" , &F) |
3329 | << "created clone " << ore::NV("NewFunction" , NewF)); |
3330 | |
3331 | // Now handle aliases to this function, and clone those as well. |
    if (!FuncToAliasMap.count(&F))
3333 | continue; |
3334 | for (auto *A : FuncToAliasMap[&F]) { |
      std::string Name = getMemProfFuncName(A->getName(), I);
      auto *PrevA = M.getNamedAlias(Name);
      auto *NewA = GlobalAlias::create(A->getValueType(),
                                       A->getType()->getPointerAddressSpace(),
                                       A->getLinkage(), Name, NewF);
      NewA->copyAttributesFrom(A);
3341 | if (PrevA) { |
3342 | // We might have created this when adjusting callsite in another |
3343 | // function. It should be a declaration. |
3344 | assert(PrevA->isDeclaration()); |
        NewA->takeName(PrevA);
        PrevA->replaceAllUsesWith(NewA);
3347 | PrevA->eraseFromParent(); |
3348 | } |
3349 | } |
3350 | } |
3351 | return VMaps; |
3352 | } |
3353 | |
3354 | // Locate the summary for F. This is complicated by the fact that it might |
3355 | // have been internalized or promoted. |
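// For example (illustrative): a function "foo" that was internal in its
// original module may be found directly via F.getGUID(), or via the GUID of
// its unqualified name if it was internalized, or via the GUID of the
// source-file-qualified identifier built below if it was promoted; each
// lookup is tried in turn until one succeeds.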
3356 | static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, |
3357 | const ModuleSummaryIndex *ImportSummary) { |
3358 | // FIXME: Ideally we would retain the original GUID in some fashion on the |
3359 | // function (e.g. as metadata), but for now do our best to locate the |
3360 | // summary without that information. |
  ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3362 | if (!TheFnVI) |
3363 | // See if theFn was internalized, by checking index directly with |
3364 | // original name (this avoids the name adjustment done by getGUID() for |
3365 | // internal symbols). |
    TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3367 | if (TheFnVI) |
3368 | return TheFnVI; |
3369 | // Now query with the original name before any promotion was performed. |
3370 | StringRef OrigName = |
      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
  std::string OrigId = GlobalValue::getGlobalIdentifier(
      OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
  TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3375 | if (TheFnVI) |
3376 | return TheFnVI; |
3377 | // Could be a promoted local imported from another module. We need to pass |
3378 | // down more info here to find the original module id. For now, try with |
3379 | // the OrigName which might have been stored in the OidGuidMap in the |
3380 | // index. This would not work if there were same-named locals in multiple |
3381 | // modules, however. |
3382 | auto OrigGUID = |
      ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3384 | if (OrigGUID) |
    TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3386 | return TheFnVI; |
3387 | } |
3388 | |
3389 | bool MemProfContextDisambiguation::applyImport(Module &M) { |
3390 | assert(ImportSummary); |
3391 | bool Changed = false; |
3392 | |
3393 | auto IsMemProfClone = [](const Function &F) { |
    return F.getName().contains(MemProfCloneSuffix);
3395 | }; |
3396 | |
3397 | // We also need to clone any aliases that reference cloned functions, because |
3398 | // the modified callsites may invoke via the alias. Keep track of the aliases |
3399 | // for each function. |
3400 | std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
3401 | FuncToAliasMap; |
3402 | for (auto &A : M.aliases()) { |
3403 | auto *Aliasee = A.getAliaseeObject(); |
    if (auto *F = dyn_cast<Function>(Aliasee))
      FuncToAliasMap[F].insert(&A);
3406 | } |
3407 | |
3408 | for (auto &F : M) { |
3409 | if (F.isDeclaration() || IsMemProfClone(F)) |
3410 | continue; |
3411 | |
3412 | OptimizationRemarkEmitter ORE(&F); |
3413 | |
3414 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
3415 | bool ClonesCreated = false; |
3416 | unsigned NumClonesCreated = 0; |
3417 | auto CloneFuncIfNeeded = [&](unsigned NumClones) { |
3418 | // We should at least have version 0 which is the original copy. |
3419 | assert(NumClones > 0); |
3420 | // If only one copy needed use original. |
3421 | if (NumClones == 1) |
3422 | return; |
3423 | // If we already performed cloning of this function, confirm that the |
3424 | // requested number of clones matches (the thin link should ensure the |
3425 | // number of clones for each constituent callsite is consistent within |
3426 | // each function), before returning. |
3427 | if (ClonesCreated) { |
3428 | assert(NumClonesCreated == NumClones); |
3429 | return; |
3430 | } |
3431 | VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); |
3432 | // The first "clone" is the original copy, which doesn't have a VMap. |
3433 | assert(VMaps.size() == NumClones - 1); |
3434 | Changed = true; |
3435 | ClonesCreated = true; |
3436 | NumClonesCreated = NumClones; |
3437 | }; |
3438 | |
3439 | auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, |
3440 | Function *CalledFunction) { |
3441 | // Perform cloning if not yet done. |
3442 | CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); |
3443 | |
3444 | // Should have skipped indirect calls via mayHaveMemprofSummary. |
3445 | assert(CalledFunction); |
3446 | assert(!IsMemProfClone(*CalledFunction)); |
3447 | |
3448 | // Update the calls per the summary info. |
3449 | // Save orig name since it gets updated in the first iteration |
3450 | // below. |
3451 | auto CalleeOrigName = CalledFunction->getName(); |
3452 | for (unsigned J = 0; J < StackNode.Clones.size(); J++) { |
3453 | // Do nothing if this version calls the original version of its |
3454 | // callee. |
3455 | if (!StackNode.Clones[J]) |
3456 | continue; |
        auto NewF = M.getOrInsertFunction(
            getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
            CalledFunction->getFunctionType());
3460 | CallBase *CBClone; |
3461 | // Copy 0 is the original function. |
3462 | if (!J) |
3463 | CBClone = CB; |
3464 | else |
          CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3466 | CBClone->setCalledFunction(NewF); |
3467 | ORE.emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CBClone) |
3468 | << ore::NV("Call" , CBClone) << " in clone " |
3469 | << ore::NV("Caller" , CBClone->getFunction()) |
3470 | << " assigned to call function clone " |
3471 | << ore::NV("Callee" , NewF.getCallee())); |
3472 | } |
3473 | }; |
3474 | |
3475 | // Locate the summary for F. |
3476 | ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); |
3477 | // If not found, this could be an imported local (see comment in |
3478 | // findValueInfoForFunc). Skip for now as it will be cloned in its original |
3479 | // module (where it would have been promoted to global scope so should |
3480 | // satisfy any reference in this module). |
3481 | if (!TheFnVI) |
3482 | continue; |
3483 | |
3484 | auto *GVSummary = |
        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3486 | if (!GVSummary) { |
      // Must have been imported; use the summary which matches the definition
      // (there might be multiple if this was a linkonce_odr).
      auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3490 | assert(SrcModuleMD && |
3491 | "enable-import-metadata is needed to emit thinlto_src_module" ); |
3492 | StringRef SrcModule = |
          dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3494 | for (auto &GVS : TheFnVI.getSummaryList()) { |
3495 | if (GVS->modulePath() == SrcModule) { |
3496 | GVSummary = GVS.get(); |
3497 | break; |
3498 | } |
3499 | } |
3500 | assert(GVSummary && GVSummary->modulePath() == SrcModule); |
3501 | } |
3502 | |
3503 | // If this was an imported alias skip it as we won't have the function |
3504 | // summary, and it should be cloned in the original module. |
    if (isa<AliasSummary>(GVSummary))
3506 | continue; |
3507 | |
    auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3509 | |
3510 | if (FS->allocs().empty() && FS->callsites().empty()) |
3511 | continue; |
3512 | |
3513 | auto SI = FS->callsites().begin(); |
3514 | auto AI = FS->allocs().begin(); |
3515 | |
3516 | // To handle callsite infos synthesized for tail calls which have missing |
3517 | // frames in the profiled context, map callee VI to the synthesized callsite |
3518 | // info. |
3519 | DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; |
3520 | // Iterate the callsites for this function in reverse, since we place all |
3521 | // those synthesized for tail calls at the end. |
3522 | for (auto CallsiteIt = FS->callsites().rbegin(); |
3523 | CallsiteIt != FS->callsites().rend(); CallsiteIt++) { |
3524 | auto &Callsite = *CallsiteIt; |
3525 | // Stop as soon as we see a non-synthesized callsite info (see comment |
3526 | // above loop). All the entries added for discovered tail calls have empty |
3527 | // stack ids. |
3528 | if (!Callsite.StackIdIndices.empty()) |
3529 | break; |
      MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3531 | } |
3532 | |
3533 | // Assume for now that the instructions are in the exact same order |
3534 | // as when the summary was created, but confirm this is correct by |
3535 | // matching the stack ids. |
3536 | for (auto &BB : F) { |
3537 | for (auto &I : BB) { |
        auto *CB = dyn_cast<CallBase>(&I);
3539 | // Same handling as when creating module summary. |
3540 | if (!mayHaveMemprofSummary(CB)) |
3541 | continue; |
3542 | |
3543 | auto *CalledValue = CB->getCalledOperand(); |
3544 | auto *CalledFunction = CB->getCalledFunction(); |
3545 | if (CalledValue && !CalledFunction) { |
3546 | CalledValue = CalledValue->stripPointerCasts(); |
3547 | // Stripping pointer casts can reveal a called function. |
          CalledFunction = dyn_cast<Function>(CalledValue);
3549 | } |
3550 | // Check if this is an alias to a function. If so, get the |
3551 | // called aliasee for the checks below. |
        if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3553 | assert(!CalledFunction && |
3554 | "Expected null called function in callsite for alias" ); |
          CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3556 | } |
3557 | |
3558 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
            I.getMetadata(LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3561 | |
3562 | // Include allocs that were already assigned a memprof function |
3563 | // attribute in the statistics. |
        if (CB->getAttributes().hasFnAttr("memprof")) {
3565 | assert(!MemProfMD); |
          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3567 | ? AllocTypeColdThinBackend++ |
3568 | : AllocTypeNotColdThinBackend++; |
3569 | OrigAllocsThinBackend++; |
3570 | AllocVersionsThinBackend++; |
3571 | if (!MaxAllocVersionsThinBackend) |
3572 | MaxAllocVersionsThinBackend = 1; |
          // Remove any remaining callsite metadata; we can skip the rest of
          // the handling for this instruction, since no cloning is needed.
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
3576 | continue; |
3577 | } |
3578 | |
3579 | if (MemProfMD) { |
3580 | // Consult the next alloc node. |
3581 | assert(AI != FS->allocs().end()); |
3582 | auto &AllocNode = *(AI++); |
3583 | |
3584 | // Sanity check that the MIB stack ids match between the summary and |
3585 | // instruction metadata. |
3586 | auto MIBIter = AllocNode.MIBs.begin(); |
3587 | for (auto &MDOp : MemProfMD->operands()) { |
3588 | assert(MIBIter != AllocNode.MIBs.end()); |
3589 | LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter = |
3590 | MIBIter->StackIdIndices.begin(); |
            auto *MIBMD = cast<const MDNode>(MDOp);
            MDNode *StackMDNode = getMIBStackNode(MIBMD);
3593 | assert(StackMDNode); |
3594 | CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); |
3595 | auto ContextIterBegin = |
                StackContext.beginAfterSharedPrefix(CallsiteContext);
3597 | // Skip the checking on the first iteration. |
3598 | uint64_t LastStackContextId = |
3599 | (ContextIterBegin != StackContext.end() && |
3600 | *ContextIterBegin == 0) |
3601 | ? 1 |
3602 | : 0; |
3603 | for (auto ContextIter = ContextIterBegin; |
3604 | ContextIter != StackContext.end(); ++ContextIter) { |
3605 | // If this is a direct recursion, simply skip the duplicate |
3606 | // entries, to be consistent with how the summary ids were |
3607 | // generated during ModuleSummaryAnalysis. |
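              // For example (illustrative): a metadata context with stack
              // ids [A, A, B] from direct recursion matches summary indices
              // for [A, B]; the duplicate A is skipped by the check below.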
3608 | if (LastStackContextId == *ContextIter) |
3609 | continue; |
3610 | LastStackContextId = *ContextIter; |
3611 | assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); |
3612 | assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == |
3613 | *ContextIter); |
3614 | StackIdIndexIter++; |
3615 | } |
3616 | MIBIter++; |
3617 | } |
3618 | |
3619 | // Perform cloning if not yet done. |
3620 | CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); |
3621 | |
3622 | OrigAllocsThinBackend++; |
3623 | AllocVersionsThinBackend += AllocNode.Versions.size(); |
3624 | if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) |
3625 | MaxAllocVersionsThinBackend = AllocNode.Versions.size(); |
3626 | |
3627 | // If there is only one version that means we didn't end up |
3628 | // considering this function for cloning, and in that case the alloc |
3629 | // will still be none type or should have gotten the default NotCold. |
3630 | // Skip that after calling clone helper since that does some sanity |
3631 | // checks that confirm we haven't decided yet that we need cloning. |
3632 | if (AllocNode.Versions.size() == 1) { |
3633 | assert((AllocationType)AllocNode.Versions[0] == |
3634 | AllocationType::NotCold || |
3635 | (AllocationType)AllocNode.Versions[0] == |
3636 | AllocationType::None); |
3637 | UnclonableAllocsThinBackend++; |
3638 | continue; |
3639 | } |
3640 | |
3641 | // All versions should have a singular allocation type. |
3642 | assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { |
3643 | return Type == ((uint8_t)AllocationType::NotCold | |
3644 | (uint8_t)AllocationType::Cold); |
3645 | })); |
3646 | |
3647 | // Update the allocation types per the summary info. |
3648 | for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { |
3649 | // Ignore any that didn't get an assigned allocation type. |
3650 | if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) |
3651 | continue; |
3652 | AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; |
3653 | AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ |
3654 | : AllocTypeNotColdThinBackend++; |
            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
            auto A = llvm::Attribute::get(F.getContext(), "memprof",
                                          AllocTypeString);
3658 | CallBase *CBClone; |
3659 | // Copy 0 is the original function. |
3660 | if (!J) |
3661 | CBClone = CB; |
3662 | else |
3663 | // Since VMaps are only created for new clones, we index with |
3664 | // clone J-1 (J==0 is the original clone and does not have a VMaps |
3665 | // entry). |
              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(A);
3668 | ORE.emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute" , CBClone) |
3669 | << ore::NV("AllocationCall" , CBClone) << " in clone " |
3670 | << ore::NV("Caller" , CBClone->getFunction()) |
3671 | << " marked with memprof allocation attribute " |
3672 | << ore::NV("Attribute" , AllocTypeString)); |
3673 | } |
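          // Illustrative result (a sketch; symbol and attribute-group names
          // hypothetical): after this loop a cold clone's allocation call
          // looks like
          //   %call = call ptr @_Znwm(i64 8) #2
          //   attributes #2 = { ... "memprof"="cold" }
          // which a later transformation can rewrite to a hinted allocation.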
        } else if (!CallsiteContext.empty()) {
          // Consult the next callsite node.
          assert(SI != FS->callsites().end());
          auto &StackNode = *(SI++);

#ifndef NDEBUG
          // Sanity check that the stack ids match between the summary and
          // instruction metadata.
          auto StackIdIndexIter = StackNode.StackIdIndices.begin();
          for (auto StackId : CallsiteContext) {
            assert(StackIdIndexIter != StackNode.StackIdIndices.end());
            assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                   StackId);
            StackIdIndexIter++;
          }
#endif
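          // For reference (hypothetical ids): the instruction's !callsite
          // metadata is simply a list of i64 stack ids, e.g.
          //   !3 = !{i64 123, i64 456}
          // which must line up, in order, with the summary callsite record's
          // StackIdIndices as verified above.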

          CloneCallsite(StackNode, CB, CalledFunction);
        } else if (CB->isTailCall()) {
          // Locate the synthesized callsite info for the callee VI, if any
          // was created, and use that for cloning.
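          // Note: if no synthesized record is found below, no cloning is
          // performed for this call (presumably the thin link could not
          // uniquely identify the profiled callee chain through the tail
          // call; this rationale is an inference from the lookup below).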
          ValueInfo CalleeVI =
              findValueInfoForFunc(*CalledFunction, M, ImportSummary);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction);
          }
        }
        // Memprof and callsite metadata on memory allocations no longer
        // needed.
        I.setMetadata(LLVMContext::MD_memprof, nullptr);
        I.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
  }

  return Changed;
}

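// A brief orientation note for the driver below: the dump, verify, and dot
// export hooks run before and after each phase, gated by cl::opts defined
// earlier in this file (the corresponding flags are presumably
// -memprof-dump-ccg, -memprof-verify-ccg, and -memprof-export-to-dot,
// matching the DumpCCG, VerifyCCG, and ExportToDot option variables).
// identifyClones() marks which context graph nodes need clones, and
// assignFunctions() materializes function clones and reports whether the
// module or index changed.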
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
  if (DumpCCG) {
    dbgs() << "CCG before cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("postbuild");

  if (VerifyCCG)
    check();

  identifyClones();

  if (VerifyCCG)
    check();

  if (DumpCCG) {
    dbgs() << "CCG after cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("cloned");

  bool Changed = assignFunctions();

  if (DumpCCG) {
    dbgs() << "CCG after assigning function clones:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("clonefuncassign");

  return Changed;
}

bool MemProfContextDisambiguation::processModule(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {

  // If we have an import summary, then the cloning decisions were made during
  // the thin link on the index. Apply them and return.
  if (ImportSummary)
    return applyImport(M);

  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // Note that we specifically check this after applying imports above, so
  // that the option doesn't need to be passed to distributed ThinLTO backend
  // clang processes, which won't necessarily have visibility into the linker
  // dependences. Instead the information is communicated from the LTO link to
  // the backends via the combined summary index.
  if (!SupportsHotColdNew)
    return false;

  ModuleCallsiteContextGraph CCG(M, OREGetter);
  return CCG.process();
}

MemProfContextDisambiguation::MemProfContextDisambiguation(
    const ModuleSummaryIndex *Summary)
    : ImportSummary(Summary) {
  if (ImportSummary) {
    // The MemProfImportSummary should only be used for testing ThinLTO
    // distributed backend handling via opt, in which case we don't have a
    // summary from the pass pipeline.
    assert(MemProfImportSummary.empty());
    return;
  }
  if (MemProfImportSummary.empty())
    return;

  auto ReadSummaryFile =
      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
  if (!ReadSummaryFile) {
    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
                          "Error loading file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
  if (!ImportSummaryForTestingOrErr) {
    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
                          "Error parsing file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
  ImportSummary = ImportSummaryForTesting.get();
}
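// Example invocation (a sketch for testing only, assuming the
// MemProfImportSummary cl::opt is registered as -memprof-import-summary and
// this pass is registered as memprof-context-disambiguation):
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=module.thinlto.bc module.bc -S
// This exercises the applyImport path above without running a full
// distributed ThinLTO link.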

PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };
  if (!processModule(M, OREGetter))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

void MemProfContextDisambiguation::run(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing) {
  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // The index was set from the option, so these should be in sync.
  assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
  if (!SupportsHotColdNew)
    return;

  IndexCallsiteContextGraph CCG(Index, isPrevailing);
  CCG.process();
}