1//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements support for context disambiguation of allocation
10// calls for profile guided heap optimization. Specifically, it uses Memprof
11// profiles which indicate context specific allocation behavior (currently
12// distinguishing cold vs hot memory allocations). Cloning is performed to
13// expose the cold allocation call contexts, and the allocation calls are
14// subsequently annotated with an attribute for later transformation.
15//
16// The transformations can be performed either directly on IR (regular LTO), or
17// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
19// uses CRTP to support either IR or Index formats.
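//
// For illustration only (the exact metadata layout is produced by the MemProf
// profile matcher, and the ids below are made up), an annotated allocation in
// IR looks roughly like:
//
//   %call = call ptr @malloc(i64 8), !memprof !0, !callsite !5
//   !0 = !{!1, !3}            ; list of MIBs for this allocation
//   !1 = !{!2, !"notcold"}    ; MIB: call stack + allocation type
//   !2 = !{i64 123, i64 456}  ; stack ids, leaf (allocation site) to root
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}           ; stack id(s) for the allocation callsite
//
// After cloning, each allocation version is annotated with an attribute such
// as "memprof"="cold" that later transformation can use to select a hot/cold
// allocator interface.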
20//
21//===----------------------------------------------------------------------===//
22
23#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/SetOperations.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/Analysis/MemoryProfileInfo.h"
33#include "llvm/Analysis/ModuleSummaryAnalysis.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/Bitcode/BitcodeReader.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/Instructions.h"
38#include "llvm/IR/Module.h"
39#include "llvm/IR/ModuleSummaryIndex.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/FileSystem.h"
43#include "llvm/Support/GraphWriter.h"
44#include "llvm/Support/raw_ostream.h"
45#include "llvm/Transforms/IPO.h"
46#include "llvm/Transforms/Utils/Cloning.h"
47#include <sstream>
48#include <unordered_map>
49#include <vector>
50using namespace llvm;
51using namespace llvm::memprof;
52
53#define DEBUG_TYPE "memprof-context-disambiguation"
54
55STATISTIC(FunctionClonesAnalysis,
56 "Number of function clones created during whole program analysis");
57STATISTIC(FunctionClonesThinBackend,
58 "Number of function clones created during ThinLTO backend");
59STATISTIC(FunctionsClonedThinBackend,
60 "Number of functions that had clones created during ThinLTO backend");
61STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
62 "cloned) during whole program analysis");
63STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
64 "during whole program analysis");
65STATISTIC(AllocTypeNotColdThinBackend,
66 "Number of not cold static allocations (possibly cloned) during "
67 "ThinLTO backend");
68STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
69 "(possibly cloned) during ThinLTO backend");
70STATISTIC(OrigAllocsThinBackend,
71 "Number of original (not cloned) allocations with memprof profiles "
72 "during ThinLTO backend");
73STATISTIC(
74 AllocVersionsThinBackend,
75 "Number of allocation versions (including clones) during ThinLTO backend");
76STATISTIC(MaxAllocVersionsThinBackend,
77 "Maximum number of allocation versions created for an original "
78 "allocation during ThinLTO backend");
79STATISTIC(UnclonableAllocsThinBackend,
80 "Number of unclonable ambigous allocations during ThinLTO backend");
81STATISTIC(RemovedEdgesWithMismatchedCallees,
82 "Number of edges removed due to mismatched callees (profiled vs IR)");
83STATISTIC(FoundProfiledCalleeCount,
84 "Number of profiled callees found via tail calls");
85STATISTIC(FoundProfiledCalleeDepth,
86 "Aggregate depth of profiled callees found via tail calls");
87STATISTIC(FoundProfiledCalleeMaxDepth,
88 "Maximum depth of profiled callees found via tail calls");
89STATISTIC(FoundProfiledCalleeNonUniquelyCount,
90 "Number of profiled callees found via multiple tail call chains");
91
92static cl::opt<std::string> DotFilePathPrefix(
93 "memprof-dot-file-path-prefix", cl::init(Val: ""), cl::Hidden,
94 cl::value_desc("filename"),
95 cl::desc("Specify the path prefix of the MemProf dot files."));
96
static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
98 cl::Hidden,
99 cl::desc("Export graph to dot files."));
100
101static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
103 cl::desc("Dump CallingContextGraph to stdout after each stage."));
104
105static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
107 cl::desc("Perform verification checks on CallingContextGraph."));
108
109static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
111 cl::desc("Perform frequent verification checks on nodes."));
112
113static cl::opt<std::string> MemProfImportSummary(
114 "memprof-import-summary",
115 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
116 cl::Hidden);
117
118static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
120 cl::Hidden,
121 cl::desc("Max depth to recursively search for missing "
122 "frames through tail calls."));
123
124namespace llvm {
125cl::opt<bool> EnableMemProfContextDisambiguation(
126 "enable-memprof-context-disambiguation", cl::init(Val: false), cl::Hidden,
127 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
128
129// Indicate we are linking with an allocator that supports hot/cold operator
130// new interfaces.
131cl::opt<bool> SupportsHotColdNew(
132 "supports-hot-cold-new", cl::init(Val: false), cl::Hidden,
133 cl::desc("Linking with hot/cold operator new interfaces"));
134} // namespace llvm
135
136namespace {
137/// CRTP base for graphs built from either IR or ThinLTO summary index.
138///
139/// The graph represents the call contexts in all memprof metadata on allocation
140/// calls, with nodes for the allocations themselves, as well as for the calls
141/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls that have
/// callsite metadata onto those nodes, reflecting any inlining performed on
/// those calls.
145///
146/// Each MIB (representing an allocation's call context with allocation
147/// behavior) is assigned a unique context id during the graph build. The edges
148/// and nodes in the graph are decorated with the context ids they carry. This
149/// is used to correctly update the graph when cloning is performed so that we
150/// can uniquify the context for a single (possibly cloned) allocation.
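///
/// A minimal illustration (hypothetical ids): suppose allocation X has two
/// MIBs, one with call stack [A, B] marked cold and one with call stack [A, C]
/// marked notcold (leaf to root). They are assigned context ids 1 and 2. Nodes
/// X and A then carry {1, 2}, B carries {1}, C carries {2}, and the caller
/// edges X<-A, A<-B and A<-C carry the matching subsets, which is what later
/// allows cloning to split X by context.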
151template <typename DerivedCCG, typename FuncTy, typename CallTy>
152class CallsiteContextGraph {
153public:
154 CallsiteContextGraph() = default;
155 CallsiteContextGraph(const CallsiteContextGraph &) = default;
156 CallsiteContextGraph(CallsiteContextGraph &&) = default;
157
158 /// Main entry point to perform analysis and transformations on graph.
159 bool process();
160
161 /// Perform cloning on the graph necessary to uniquely identify the allocation
162 /// behavior of an allocation based on its context.
163 void identifyClones();
164
165 /// Assign callsite clones to functions, cloning functions as needed to
166 /// accommodate the combinations of their callsite clones reached by callers.
167 /// For regular LTO this clones functions and callsites in the IR, but for
168 /// ThinLTO the cloning decisions are noted in the summaries and later applied
169 /// in applyImport.
170 bool assignFunctions();
171
172 void dump() const;
173 void print(raw_ostream &OS) const;
174
175 friend raw_ostream &operator<<(raw_ostream &OS,
176 const CallsiteContextGraph &CCG) {
177 CCG.print(OS);
178 return OS;
179 }
180
181 friend struct GraphTraits<
182 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
183 friend struct DOTGraphTraits<
184 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
185
186 void exportToDot(std::string Label) const;
187
188 /// Represents a function clone via FuncTy pointer and clone number pair.
189 struct FuncInfo final
190 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
191 using Base = std::pair<FuncTy *, unsigned>;
192 FuncInfo(const Base &B) : Base(B) {}
193 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
194 explicit operator bool() const { return this->first != nullptr; }
195 FuncTy *func() const { return this->first; }
196 unsigned cloneNo() const { return this->second; }
197 };
198
199 /// Represents a callsite clone via CallTy and clone number pair.
200 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
201 using Base = std::pair<CallTy, unsigned>;
202 CallInfo(const Base &B) : Base(B) {}
203 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
204 : Base(Call, CloneNo) {}
205 explicit operator bool() const { return (bool)this->first; }
206 CallTy call() const { return this->first; }
207 unsigned cloneNo() const { return this->second; }
208 void setCloneNo(unsigned N) { this->second = N; }
209 void print(raw_ostream &OS) const {
210 if (!operator bool()) {
211 assert(!cloneNo());
212 OS << "null Call";
213 return;
214 }
215 call()->print(OS);
216 OS << "\t(clone " << cloneNo() << ")";
217 }
218 void dump() const {
219 print(OS&: dbgs());
220 dbgs() << "\n";
221 }
222 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
223 Call.print(OS);
224 return OS;
225 }
226 };
227
228 struct ContextEdge;
229
230 /// Node in the Callsite Context Graph
231 struct ContextNode {
232 // Keep this for now since in the IR case where we have an Instruction* it
233 // is not as immediately discoverable. Used for printing richer information
234 // when dumping graph.
235 bool IsAllocation;
236
237 // Keeps track of when the Call was reset to null because there was
238 // recursion.
239 bool Recursive = false;
240
241 // The corresponding allocation or interior call.
242 CallInfo Call;
243
244 // For alloc nodes this is a unique id assigned when constructed, and for
245 // callsite stack nodes it is the original stack id when the node is
246 // constructed from the memprof MIB metadata on the alloc nodes. Note that
247 // this is only used when matching callsite metadata onto the stack nodes
248 // created when processing the allocation memprof MIBs, and for labeling
249 // nodes in the dot graph. Therefore we don't bother to assign a value for
250 // clones.
251 uint64_t OrigStackOrAllocId = 0;
252
253 // This will be formed by ORing together the AllocationType enum values
254 // for contexts including this node.
255 uint8_t AllocTypes = 0;
256
257 // Edges to all callees in the profiled call stacks.
258 // TODO: Should this be a map (from Callee node) for more efficient lookup?
259 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
260
261 // Edges to all callers in the profiled call stacks.
262 // TODO: Should this be a map (from Caller node) for more efficient lookup?
263 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
264
265 // The set of IDs for contexts including this node.
266 DenseSet<uint32_t> ContextIds;
267
268 // List of clones of this ContextNode, initially empty.
269 std::vector<ContextNode *> Clones;
270
271 // If a clone, points to the original uncloned node.
272 ContextNode *CloneOf = nullptr;
273
274 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
275
276 ContextNode(bool IsAllocation, CallInfo C)
277 : IsAllocation(IsAllocation), Call(C) {}
278
279 void addClone(ContextNode *Clone) {
280 if (CloneOf) {
281 CloneOf->Clones.push_back(Clone);
282 Clone->CloneOf = CloneOf;
283 } else {
284 Clones.push_back(Clone);
285 assert(!Clone->CloneOf);
286 Clone->CloneOf = this;
287 }
288 }
289
290 ContextNode *getOrigNode() {
291 if (!CloneOf)
292 return this;
293 return CloneOf;
294 }
295
296 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
297 unsigned int ContextId);
298
299 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
300 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
301 void eraseCalleeEdge(const ContextEdge *Edge);
302 void eraseCallerEdge(const ContextEdge *Edge);
303
304 void setCall(CallInfo C) { Call = C; }
305
306 bool hasCall() const { return (bool)Call.call(); }
307
308 void printCall(raw_ostream &OS) const { Call.print(OS); }
309
310 // True if this node was effectively removed from the graph, in which case
311 // its context id set, caller edges, and callee edges should all be empty.
312 bool isRemoved() const {
313 // Note that we can have non-empty context ids with empty caller and
314 // callee edges if the graph ends up with a single node.
315 if (ContextIds.empty())
316 assert(CalleeEdges.empty() && CallerEdges.empty() &&
317 "Context ids empty but at least one of callee and caller edges "
318 "were not!");
319 return ContextIds.empty();
320 }
321
322 void dump() const;
323 void print(raw_ostream &OS) const;
324
325 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
326 Node.print(OS);
327 return OS;
328 }
329 };
330
331 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
332 /// callee.
333 struct ContextEdge {
334 ContextNode *Callee;
335 ContextNode *Caller;
336
337 // This will be formed by ORing together the AllocationType enum values
338 // for contexts including this edge.
339 uint8_t AllocTypes = 0;
340
341 // The set of IDs for contexts including this edge.
342 DenseSet<uint32_t> ContextIds;
343
344 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
345 DenseSet<uint32_t> ContextIds)
346 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
347 ContextIds(ContextIds) {}
348
349 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
350
351 void dump() const;
352 void print(raw_ostream &OS) const;
353
354 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
355 Edge.print(OS);
356 return OS;
357 }
358 };
359
360 /// Helpers to remove callee edges that have allocation type None (due to not
361 /// carrying any context ids) after transformations.
362 void removeNoneTypeCalleeEdges(ContextNode *Node);
363 void
364 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
365 DenseSet<const ContextNode *> &Visited);
366
367protected:
368 /// Get a list of nodes corresponding to the stack ids in the given callsite
369 /// context.
370 template <class NodeT, class IteratorT>
371 std::vector<uint64_t>
372 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
373
374 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
375 /// metadata (or summary).
376 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
377
378 /// Adds nodes for the given MIB stack ids.
379 template <class NodeT, class IteratorT>
380 void addStackNodesForMIB(ContextNode *AllocNode,
381 CallStack<NodeT, IteratorT> &StackContext,
382 CallStack<NodeT, IteratorT> &CallsiteContext,
383 AllocationType AllocType);
384
385 /// Matches all callsite metadata (or summary) to the nodes created for
386 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
387 /// inlining performed on those callsite instructions.
388 void updateStackNodes();
389
390 /// Update graph to conservatively handle any callsite stack nodes that target
391 /// multiple different callee target functions.
392 void handleCallsitesWithMultipleTargets();
393
394 /// Save lists of calls with MemProf metadata in each function, for faster
395 /// iteration.
396 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
397
398 /// Map from callsite node to the enclosing caller function.
399 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
400
401private:
402 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
403
404 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
405 const FuncTy *, DenseSet<uint32_t>>;
406
407 /// Assigns the given Node to calls at or inlined into the location with
408 /// the Node's stack id, after post order traversing and processing its
409 /// caller nodes. Uses the call information recorded in the given
410 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
411 /// as needed. Called by updateStackNodes which sets up the given
412 /// StackIdToMatchingCalls map.
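  ///
  /// For illustration (hypothetical ids): if a call to B was inlined into A,
  /// the resulting callsite records the stack id sequence [B, A], and a new
  /// node is synthesized for that inlined sequence, taking over the context
  /// ids common to the existing B and A stack nodes.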
413 void assignStackNodesPostOrder(
414 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
415 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
416
417 /// Duplicates the given set of context ids, updating the provided
418 /// map from each original id with the newly generated context ids,
419 /// and returning the new duplicated id set.
420 DenseSet<uint32_t> duplicateContextIds(
421 const DenseSet<uint32_t> &StackSequenceContextIds,
422 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
423
424 /// Propagates all duplicated context ids across the graph.
425 void propagateDuplicateContextIds(
426 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
427
428 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
429 /// else to its callers. Also updates OrigNode's edges to remove any context
430 /// ids moved to the newly created edge.
431 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
432 bool TowardsCallee);
433
434 /// Get the stack id corresponding to the given Id or Index (for IR this will
435 /// return itself, for a summary index this will return the id recorded in the
436 /// index for that stack id index value).
437 uint64_t getStackId(uint64_t IdOrIndex) const {
438 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
439 }
440
441 /// Returns true if the given call targets the callee of the given edge, or if
442 /// we were able to identify the call chain through intermediate tail calls.
443 /// In the latter case new context nodes are added to the graph for the
444 /// identified tail calls, and their synthesized nodes are added to
445 /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
446 /// next element after the input position (either incremented or updated after
447 /// removing the old edge).
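  ///
  /// For example (hypothetical functions): if the profiled context contains
  /// the frame A -> C but in the IR A calls B which tail calls C, the missing
  /// frame for B's call to C is discovered here and a synthesized node for it
  /// is added to the graph.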
448 bool
449 calleesMatch(CallTy Call, EdgeIter &EI,
450 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
451
452 /// Returns true if the given call targets the given function, or if we were
453 /// able to identify the call chain through intermediate tail calls (in which
454 /// case FoundCalleeChain will be populated).
455 bool calleeMatchesFunc(
456 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
457 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
458 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
459 Call, Func, CallerFunc, FoundCalleeChain);
460 }
461
462 /// Get a list of nodes corresponding to the stack ids in the given
463 /// callsite's context.
464 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
465 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
466 Call);
467 }
468
469 /// Get the last stack id in the context for callsite.
470 uint64_t getLastStackId(CallTy Call) {
471 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
472 }
473
474 /// Update the allocation call to record type of allocated memory.
475 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
476 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
477 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
478 }
479
480 /// Update non-allocation call to invoke (possibly cloned) function
481 /// CalleeFunc.
482 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
483 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
484 }
485
486 /// Clone the given function for the given callsite, recording mapping of all
  /// of the function's tracked calls to their new versions in the CallMap.
488 /// Assigns new clones to clone number CloneNo.
489 FuncInfo cloneFunctionForCallsite(
490 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
491 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
492 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
493 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
494 }
495
496 /// Gets a label to use in the dot graph for the given call clone in the given
497 /// function.
498 std::string getLabel(const FuncTy *Func, const CallTy Call,
499 unsigned CloneNo) const {
500 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
501 }
502
503 /// Helpers to find the node corresponding to the given call or stackid.
504 ContextNode *getNodeForInst(const CallInfo &C);
505 ContextNode *getNodeForAlloc(const CallInfo &C);
506 ContextNode *getNodeForStackId(uint64_t StackId);
507
508 /// Removes the node information recorded for the given call.
509 void unsetNodeForInst(const CallInfo &C);
510
511 /// Computes the alloc type corresponding to the given context ids, by
512 /// unioning their recorded alloc types.
513 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
514
  /// Returns the allocation type of the intersection of the contexts of two
516 /// nodes (based on their provided context id sets), optimized for the case
517 /// when Node1Ids is smaller than Node2Ids.
518 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
519 const DenseSet<uint32_t> &Node2Ids);
520
  /// Returns the allocation type of the intersection of the contexts of two
522 /// nodes (based on their provided context id sets).
523 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
524 const DenseSet<uint32_t> &Node2Ids);
525
526 /// Create a clone of Edge's callee and move Edge to that new callee node,
527 /// performing the necessary context id and allocation type updates.
528 /// If callee's caller edge iterator is supplied, it is updated when removing
529 /// the edge from that list. If ContextIdsToMove is non-empty, only that
530 /// subset of Edge's ids are moved to an edge to the new callee.
531 ContextNode *
532 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
533 EdgeIter *CallerEdgeI = nullptr,
534 DenseSet<uint32_t> ContextIdsToMove = {});
535
536 /// Change the callee of Edge to existing callee clone NewCallee, performing
537 /// the necessary context id and allocation type updates.
538 /// If callee's caller edge iterator is supplied, it is updated when removing
539 /// the edge from that list. If ContextIdsToMove is non-empty, only that
540 /// subset of Edge's ids are moved to an edge to the new callee.
541 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
542 ContextNode *NewCallee,
543 EdgeIter *CallerEdgeI = nullptr,
544 bool NewClone = false,
545 DenseSet<uint32_t> ContextIdsToMove = {});
546
547 /// Recursively perform cloning on the graph for the given Node and its
548 /// callers, in order to uniquely identify the allocation behavior of an
549 /// allocation given its context. The context ids of the allocation being
550 /// processed are given in AllocContextIds.
551 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
552 const DenseSet<uint32_t> &AllocContextIds);
553
554 /// Map from each context ID to the AllocationType assigned to that context.
555 std::map<uint32_t, AllocationType> ContextIdToAllocationType;
556
557 /// Identifies the context node created for a stack id when adding the MIB
558 /// contexts to the graph. This is used to locate the context nodes when
559 /// trying to assign the corresponding callsites with those stack ids to these
560 /// nodes.
561 std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
562
563 /// Maps to track the calls to their corresponding nodes in the graph.
564 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
565 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
566
567 /// Owner of all ContextNode unique_ptrs.
568 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
569
570 /// Perform sanity checks on graph when requested.
571 void check() const;
572
573 /// Keeps track of the last unique context id assigned.
574 unsigned int LastContextId = 0;
575};
576
577template <typename DerivedCCG, typename FuncTy, typename CallTy>
578using ContextNode =
579 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
580template <typename DerivedCCG, typename FuncTy, typename CallTy>
581using ContextEdge =
582 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
583template <typename DerivedCCG, typename FuncTy, typename CallTy>
584using FuncInfo =
585 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
586template <typename DerivedCCG, typename FuncTy, typename CallTy>
587using CallInfo =
588 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
589
590/// CRTP derived class for graphs built from IR (regular LTO).
591class ModuleCallsiteContextGraph
592 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
593 Instruction *> {
594public:
595 ModuleCallsiteContextGraph(
596 Module &M,
597 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
598
599private:
600 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
601 Instruction *>;
602
603 uint64_t getStackId(uint64_t IdOrIndex) const;
604 bool calleeMatchesFunc(
605 Instruction *Call, const Function *Func, const Function *CallerFunc,
606 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
607 bool findProfiledCalleeThroughTailCalls(
608 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
609 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
610 bool &FoundMultipleCalleeChains);
611 uint64_t getLastStackId(Instruction *Call);
612 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
613 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
614 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
615 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
616 Instruction *>::FuncInfo
617 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
618 std::map<CallInfo, CallInfo> &CallMap,
619 std::vector<CallInfo> &CallsWithMetadataInFunc,
620 unsigned CloneNo);
621 std::string getLabel(const Function *Func, const Instruction *Call,
622 unsigned CloneNo) const;
623
624 const Module &Mod;
625 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
626};
627
628/// Represents a call in the summary index graph, which can either be an
629/// allocation or an interior callsite node in an allocation's context.
630/// Holds a pointer to the corresponding data structure in the index.
631struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
632 IndexCall() : PointerUnion() {}
633 IndexCall(std::nullptr_t) : IndexCall() {}
634 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
635 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
636 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
637
638 IndexCall *operator->() { return this; }
639
640 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
641
642 void print(raw_ostream &OS) const {
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
647 assert(CI);
648 OS << *CI;
649 }
650 }
651};
652
653/// CRTP derived class for graphs built from summary index (ThinLTO).
654class IndexCallsiteContextGraph
655 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
656 IndexCall> {
657public:
658 IndexCallsiteContextGraph(
659 ModuleSummaryIndex &Index,
660 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
661 isPrevailing);
662
663 ~IndexCallsiteContextGraph() {
664 // Now that we are done with the graph it is safe to add the new
665 // CallsiteInfo structs to the function summary vectors. The graph nodes
666 // point into locations within these vectors, so we don't want to add them
667 // any earlier.
668 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
669 auto *FS = I.first;
670 for (auto &Callsite : I.second)
671 FS->addCallsite(Callsite&: *Callsite.second);
672 }
673 }
674
675private:
676 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
677 IndexCall>;
678
679 uint64_t getStackId(uint64_t IdOrIndex) const;
680 bool calleeMatchesFunc(
681 IndexCall &Call, const FunctionSummary *Func,
682 const FunctionSummary *CallerFunc,
683 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
684 bool findProfiledCalleeThroughTailCalls(
685 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
686 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
687 bool &FoundMultipleCalleeChains);
688 uint64_t getLastStackId(IndexCall &Call);
689 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
690 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
691 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
692 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
693 IndexCall>::FuncInfo
694 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
695 std::map<CallInfo, CallInfo> &CallMap,
696 std::vector<CallInfo> &CallsWithMetadataInFunc,
697 unsigned CloneNo);
698 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
699 unsigned CloneNo) const;
700
  // Saves mapping from function summaries containing memprof records back to
  // their ValueInfo, for use in checking and debugging.
703 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
704
705 const ModuleSummaryIndex &Index;
706 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
707 isPrevailing;
708
709 // Saves/owns the callsite info structures synthesized for missing tail call
710 // frames that we discover while building the graph.
711 // It maps from the summary of the function making the tail call, to a map
712 // of callee ValueInfo to corresponding synthesized callsite info.
713 std::unordered_map<FunctionSummary *,
714 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
715 FunctionCalleesToSynthesizedCallsiteInfos;
716};
717} // namespace
718
719namespace llvm {
720template <>
721struct DenseMapInfo<typename CallsiteContextGraph<
722 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
723 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
724template <>
725struct DenseMapInfo<typename CallsiteContextGraph<
726 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
727 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
728template <>
729struct DenseMapInfo<IndexCall>
730 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
731} // end namespace llvm
732
733namespace {
734
735struct FieldSeparator {
736 bool Skip = true;
737 const char *Sep;
738
739 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
740};
741
742raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
743 if (FS.Skip) {
744 FS.Skip = false;
745 return OS;
746 }
747 return OS << FS.Sep;
748}
749
750// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
751// type we should actually use on the corresponding allocation.
752// If we can't clone a node that has NotCold+Cold alloc type, we will fall
753// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
754// from NotCold.
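// For instance, allocTypeToUse(NotCold | Cold) yields NotCold, while a pure
// Cold input stays Cold (see the implementation below).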
755AllocationType allocTypeToUse(uint8_t AllocTypes) {
756 assert(AllocTypes != (uint8_t)AllocationType::None);
757 if (AllocTypes ==
758 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
759 return AllocationType::NotCold;
760 else
761 return (AllocationType)AllocTypes;
762}
763
764// Helper to check if the alloc types for all edges recorded in the
765// InAllocTypes vector match the alloc types for all edges in the Edges
766// vector.
767template <typename DerivedCCG, typename FuncTy, typename CallTy>
768bool allocTypesMatch(
769 const std::vector<uint8_t> &InAllocTypes,
770 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
771 &Edges) {
772 return std::equal(
773 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
774 [](const uint8_t &l,
775 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
776 // Can share if one of the edges is None type - don't
777 // care about the type along that edge as it doesn't
778 // exist for those context ids.
779 if (l == (uint8_t)AllocationType::None ||
780 r->AllocTypes == (uint8_t)AllocationType::None)
781 return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
783 });
784}
785
786} // end anonymous namespace
787
788template <typename DerivedCCG, typename FuncTy, typename CallTy>
789typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
790CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
791 const CallInfo &C) {
792 ContextNode *Node = getNodeForAlloc(C);
793 if (Node)
794 return Node;
795
796 return NonAllocationCallToContextNodeMap.lookup(C);
797}
798
799template <typename DerivedCCG, typename FuncTy, typename CallTy>
800typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
801CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
802 const CallInfo &C) {
803 return AllocationCallToContextNodeMap.lookup(C);
804}
805
806template <typename DerivedCCG, typename FuncTy, typename CallTy>
807typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
808CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
809 uint64_t StackId) {
810 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
811 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
812 return StackEntryNode->second;
813 return nullptr;
814}
815
816template <typename DerivedCCG, typename FuncTy, typename CallTy>
817void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
818 const CallInfo &C) {
819 AllocationCallToContextNodeMap.erase(C) ||
820 NonAllocationCallToContextNodeMap.erase(C);
821 assert(!AllocationCallToContextNodeMap.count(C) &&
822 !NonAllocationCallToContextNodeMap.count(C));
823}
824
825template <typename DerivedCCG, typename FuncTy, typename CallTy>
826void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
827 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
828 unsigned int ContextId) {
829 for (auto &Edge : CallerEdges) {
830 if (Edge->Caller == Caller) {
831 Edge->AllocTypes |= (uint8_t)AllocType;
832 Edge->getContextIds().insert(ContextId);
833 return;
834 }
835 }
836 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
837 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
838 CallerEdges.push_back(Edge);
839 Caller->CalleeEdges.push_back(Edge);
840}
841
842template <typename DerivedCCG, typename FuncTy, typename CallTy>
843void CallsiteContextGraph<
844 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
845 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
846 auto Edge = *EI;
847 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
848 assert(Edge->ContextIds.empty());
849 Edge->Callee->eraseCallerEdge(Edge.get());
850 EI = Node->CalleeEdges.erase(EI);
851 } else
852 ++EI;
853 }
854}
855
856template <typename DerivedCCG, typename FuncTy, typename CallTy>
857typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
858CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
859 findEdgeFromCallee(const ContextNode *Callee) {
860 for (const auto &Edge : CalleeEdges)
861 if (Edge->Callee == Callee)
862 return Edge.get();
863 return nullptr;
864}
865
866template <typename DerivedCCG, typename FuncTy, typename CallTy>
867typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
868CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
869 findEdgeFromCaller(const ContextNode *Caller) {
870 for (const auto &Edge : CallerEdges)
871 if (Edge->Caller == Caller)
872 return Edge.get();
873 return nullptr;
874}
875
876template <typename DerivedCCG, typename FuncTy, typename CallTy>
877void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
878 eraseCalleeEdge(const ContextEdge *Edge) {
879 auto EI = llvm::find_if(
880 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
881 return CalleeEdge.get() == Edge;
882 });
883 assert(EI != CalleeEdges.end());
884 CalleeEdges.erase(EI);
885}
886
887template <typename DerivedCCG, typename FuncTy, typename CallTy>
888void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
889 eraseCallerEdge(const ContextEdge *Edge) {
890 auto EI = llvm::find_if(
891 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
892 return CallerEdge.get() == Edge;
893 });
894 assert(EI != CallerEdges.end());
895 CallerEdges.erase(EI);
896}
897
898template <typename DerivedCCG, typename FuncTy, typename CallTy>
899uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
900 DenseSet<uint32_t> &ContextIds) {
901 uint8_t BothTypes =
902 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
903 uint8_t AllocType = (uint8_t)AllocationType::None;
904 for (auto Id : ContextIds) {
905 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
906 // Bail early if alloc type reached both, no further refinement.
907 if (AllocType == BothTypes)
908 return AllocType;
909 }
910 return AllocType;
911}
912
913template <typename DerivedCCG, typename FuncTy, typename CallTy>
914uint8_t
915CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
916 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
917 uint8_t BothTypes =
918 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
919 uint8_t AllocType = (uint8_t)AllocationType::None;
920 for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
922 continue;
923 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
924 // Bail early if alloc type reached both, no further refinement.
925 if (AllocType == BothTypes)
926 return AllocType;
927 }
928 return AllocType;
929}
930
931template <typename DerivedCCG, typename FuncTy, typename CallTy>
932uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
933 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
934 if (Node1Ids.size() < Node2Ids.size())
935 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
936 else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
938}
939
940template <typename DerivedCCG, typename FuncTy, typename CallTy>
941typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
942CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
943 CallInfo Call, const FuncTy *F) {
944 assert(!getNodeForAlloc(Call));
945 NodeOwner.push_back(
946 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
947 ContextNode *AllocNode = NodeOwner.back().get();
948 AllocationCallToContextNodeMap[Call] = AllocNode;
949 NodeToCallingFunc[AllocNode] = F;
  // Use LastContextId as a unique id for MIB allocation nodes.
951 AllocNode->OrigStackOrAllocId = LastContextId;
952 // Alloc type should be updated as we add in the MIBs. We should assert
953 // afterwards that it is not still None.
954 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
955
956 return AllocNode;
957}
958
959template <typename DerivedCCG, typename FuncTy, typename CallTy>
960template <class NodeT, class IteratorT>
961void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
962 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
963 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
  // Treat the Hot alloc type as NotCold until the disambiguation for "hot"
  // is done.
966 if (AllocType == AllocationType::Hot)
967 AllocType = AllocationType::NotCold;
968
969 ContextIdToAllocationType[++LastContextId] = AllocType;
970
971 // Update alloc type and context ids for this MIB.
972 AllocNode->AllocTypes |= (uint8_t)AllocType;
973 AllocNode->ContextIds.insert(LastContextId);
974
975 // Now add or update nodes for each stack id in alloc's context.
976 // Later when processing the stack ids on non-alloc callsites we will adjust
977 // for any inlining in the context.
978 ContextNode *PrevNode = AllocNode;
979 // Look for recursion (direct recursion should have been collapsed by
980 // module summary analysis, here we should just be detecting mutual
981 // recursion). Mark these nodes so we don't try to clone.
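  // For example (hypothetical frames): mutual recursion A -> B -> A repeats
  // stack id A within a single context, so the node for A is marked Recursive.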
982 SmallSet<uint64_t, 8> StackIdSet;
983 // Skip any on the allocation call (inlining).
984 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
985 ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
987 ContextNode *StackNode = getNodeForStackId(StackId);
988 if (!StackNode) {
989 NodeOwner.push_back(
990 std::make_unique<ContextNode>(/*IsAllocation=*/false));
991 StackNode = NodeOwner.back().get();
992 StackEntryIdToContextNodeMap[StackId] = StackNode;
993 StackNode->OrigStackOrAllocId = StackId;
994 }
995 auto Ins = StackIdSet.insert(StackId);
996 if (!Ins.second)
997 StackNode->Recursive = true;
998 StackNode->ContextIds.insert(LastContextId);
999 StackNode->AllocTypes |= (uint8_t)AllocType;
1000 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1001 PrevNode = StackNode;
1002 }
1003}
1004
1005template <typename DerivedCCG, typename FuncTy, typename CallTy>
1006DenseSet<uint32_t>
1007CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1008 const DenseSet<uint32_t> &StackSequenceContextIds,
1009 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1010 DenseSet<uint32_t> NewContextIds;
1011 for (auto OldId : StackSequenceContextIds) {
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
1014 assert(ContextIdToAllocationType.count(OldId));
1015 // The new context has the same allocation type as original.
1016 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1017 }
1018 return NewContextIds;
1019}
1020
1021template <typename DerivedCCG, typename FuncTy, typename CallTy>
1022void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1023 propagateDuplicateContextIds(
1024 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1025 // Build a set of duplicated context ids corresponding to the input id set.
1026 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1027 DenseSet<uint32_t> NewIds;
1028 for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert(NewId->second.begin(), NewId->second.end());
1032 return NewIds;
1033 };
1034
1035 // Recursively update context ids sets along caller edges.
1036 auto UpdateCallers = [&](ContextNode *Node,
1037 DenseSet<const ContextEdge *> &Visited,
1038 auto &&UpdateCallers) -> void {
1039 for (const auto &Edge : Node->CallerEdges) {
1040 auto Inserted = Visited.insert(Edge.get());
1041 if (!Inserted.second)
1042 continue;
1043 ContextNode *NextNode = Edge->Caller;
1044 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1045 // Only need to recursively iterate to NextNode via this caller edge if
1046 // it resulted in any added ids to NextNode.
1047 if (!NewIdsToAdd.empty()) {
1048 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1049 NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1050 UpdateCallers(NextNode, Visited, UpdateCallers);
1051 }
1052 }
1053 };
1054
1055 DenseSet<const ContextEdge *> Visited;
1056 for (auto &Entry : AllocationCallToContextNodeMap) {
1057 auto *Node = Entry.second;
1058 // Update ids on the allocation nodes before calling the recursive
1059 // update along caller edges, since this simplifies the logic during
1060 // that traversal.
1061 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
1062 Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1063 UpdateCallers(Node, Visited, UpdateCallers);
1064 }
1065}
1066
1067template <typename DerivedCCG, typename FuncTy, typename CallTy>
1068void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1069 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
1070 // Make a copy of the context ids, since this will be adjusted below as they
1071 // are moved.
1072 DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
1073 auto &OrigEdges =
1074 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1075 // Increment iterator in loop so that we can remove edges as needed.
1076 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1077 auto Edge = *EI;
1078 // Remove any matching context ids from Edge, return set that were found and
1079 // removed, these are the new edge's context ids. Also update the remaining
1080 // (not found ids).
1081 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1082 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1083 NotFoundContextIds);
1084 RemainingContextIds.swap(RHS&: NotFoundContextIds);
1085 // If no matching context ids for this edge, skip it.
1086 if (NewEdgeContextIds.empty()) {
1087 ++EI;
1088 continue;
1089 }
1090 if (TowardsCallee) {
1091 auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1093 NewEdgeContextIds);
1094 NewNode->CalleeEdges.push_back(NewEdge);
1095 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1096 } else {
1097 auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1099 NewEdgeContextIds);
1100 NewNode->CallerEdges.push_back(NewEdge);
1101 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1102 }
1103 // Remove old edge if context ids empty.
1104 if (Edge->getContextIds().empty()) {
1105 if (TowardsCallee) {
1106 Edge->Callee->eraseCallerEdge(Edge.get());
1107 EI = OrigNode->CalleeEdges.erase(EI);
1108 } else {
1109 Edge->Caller->eraseCalleeEdge(Edge.get());
1110 EI = OrigNode->CallerEdges.erase(EI);
1111 }
1112 continue;
1113 }
1114 ++EI;
1115 }
1116}
1117
1118template <typename DerivedCCG, typename FuncTy, typename CallTy>
1119void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1120 assignStackNodesPostOrder(ContextNode *Node,
1121 DenseSet<const ContextNode *> &Visited,
1122 DenseMap<uint64_t, std::vector<CallContextInfo>>
1123 &StackIdToMatchingCalls) {
1124 auto Inserted = Visited.insert(Node);
1125 if (!Inserted.second)
1126 return;
1127 // Post order traversal. Iterate over a copy since we may add nodes and
1128 // therefore new callers during the recursive call, invalidating any
1129 // iterator over the original edge vector. We don't need to process these
1130 // new nodes as they were already processed on creation.
1131 auto CallerEdges = Node->CallerEdges;
1132 for (auto &Edge : CallerEdges) {
1133 // Skip any that have been removed during the recursion.
1134 if (!Edge)
1135 continue;
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1137 }
1138
1139 // If this node's stack id is in the map, update the graph to contain new
1140 // nodes representing any inlining at interior callsites. Note we move the
1141 // associated context ids over to the new nodes.
1142
1143 // Ignore this node if it is for an allocation or we didn't record any
1144 // stack id lists ending at it.
1145 if (Node->IsAllocation ||
1146 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1147 return;
1148
1149 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1150 // Handle the simple case first. A single call with a single stack id.
1151 // In this case there is no need to create any new context nodes, simply
1152 // assign the context node for stack id to this Call.
1153 if (Calls.size() == 1) {
1154 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1155 if (Ids.size() == 1) {
1156 assert(SavedContextIds.empty());
1157 // It should be this Node
1158 assert(Node == getNodeForStackId(Ids[0]));
1159 if (Node->Recursive)
1160 return;
1161 Node->setCall(Call);
1162 NonAllocationCallToContextNodeMap[Call] = Node;
1163 NodeToCallingFunc[Node] = Func;
1164 return;
1165 }
1166 }
1167
1168 // Find the node for the last stack id, which should be the same
1169 // across all calls recorded for this id, and is this node's id.
1170 uint64_t LastId = Node->OrigStackOrAllocId;
  ContextNode *LastNode = getNodeForStackId(LastId);
1172 // We should only have kept stack ids that had nodes.
1173 assert(LastNode);
1174
1175 for (unsigned I = 0; I < Calls.size(); I++) {
1176 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1177 // Skip any for which we didn't assign any ids, these don't get a node in
1178 // the graph.
1179 if (SavedContextIds.empty())
1180 continue;
1181
1182 assert(LastId == Ids.back());
1183
    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1185 assert(FirstNode);
1186
1187 // Recompute the context ids for this stack id sequence (the
1188 // intersection of the context ids of the corresponding nodes).
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and may have
    // already removed them during the post order traversal.
1193 set_intersect(SavedContextIds, FirstNode->ContextIds);
1194 ContextNode *PrevNode = nullptr;
1195 for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
1197 // We should only have kept stack ids that had nodes and weren't
1198 // recursive.
1199 assert(CurNode);
1200 assert(!CurNode->Recursive);
1201 if (!PrevNode) {
1202 PrevNode = CurNode;
1203 continue;
1204 }
1205 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1206 if (!Edge) {
1207 SavedContextIds.clear();
1208 break;
1209 }
1210 PrevNode = CurNode;
1211 set_intersect(SavedContextIds, Edge->getContextIds());
1212
1213 // If we now have no context ids for clone, skip this call.
1214 if (SavedContextIds.empty())
1215 break;
1216 }
1217 if (SavedContextIds.empty())
1218 continue;
1219
1220 // Create new context node.
1221 NodeOwner.push_back(
1222 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1223 ContextNode *NewNode = NodeOwner.back().get();
1224 NodeToCallingFunc[NewNode] = Func;
1225 NonAllocationCallToContextNodeMap[Call] = NewNode;
1226 NewNode->ContextIds = SavedContextIds;
1227 NewNode->AllocTypes = computeAllocType(ContextIds&: NewNode->ContextIds);
1228
1229 // Connect to callees of innermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's callees to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1233
1234 // Connect to callers of outermost stack frame in inlined call chain.
    // This updates context ids for LastNode's callers to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1238
1239 // Now we need to remove context ids from edges/nodes between First and
1240 // Last Node.
1241 PrevNode = nullptr;
1242 for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
1244 // We should only have kept stack ids that had nodes.
1245 assert(CurNode);
1246
1247 // Remove the context ids moved to NewNode from CurNode, and the
1248 // edge from the prior node.
1249 set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1250 if (PrevNode) {
1251 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1252 assert(PrevEdge);
1253 set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1254 if (PrevEdge->getContextIds().empty()) {
1255 PrevNode->eraseCallerEdge(PrevEdge);
1256 CurNode->eraseCalleeEdge(PrevEdge);
1257 }
1258 }
1259 PrevNode = CurNode;
1260 }
1261 }
1262}
1263
1264template <typename DerivedCCG, typename FuncTy, typename CallTy>
1265void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1266 // Map of stack id to all calls with that as the last (outermost caller)
1267 // callsite id that has a context node (some might not due to pruning
1268 // performed during matching of the allocation profile contexts).
1269 // The CallContextInfo contains the Call and a list of its stack ids with
1270 // ContextNodes, the function containing Call, and the set of context ids
1271 // the analysis will eventually identify for use in any new node created
1272 // for that callsite.
1273 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1274 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1275 for (auto &Call : CallsWithMetadata) {
1276 // Ignore allocations, already handled.
1277 if (AllocationCallToContextNodeMap.count(Call))
1278 continue;
1279 auto StackIdsWithContextNodes =
          getStackIdsWithContextNodesForCall(Call.call());
1281 // If there were no nodes created for MIBs on allocs (maybe this was in
1282 // the unambiguous part of the MIB stack that was pruned), ignore.
1283 if (StackIdsWithContextNodes.empty())
1284 continue;
1285 // Otherwise, record this Call along with the list of ids for the last
1286 // (outermost caller) stack id with a node.
1287 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1288 {Call.call(), StackIdsWithContextNodes, Func, {}});
1289 }
1290 }
1291
1292 // First make a pass through all stack ids that correspond to a call,
1293 // as identified in the above loop. Compute the context ids corresponding to
  // each of these calls when they correspond to multiple stack ids due to
  // inlining. Perform any duplication of context ids required when
1296 // there is more than one call with the same stack ids. Their (possibly newly
1297 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1298 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1299 for (auto &It : StackIdToMatchingCalls) {
1300 auto &Calls = It.getSecond();
1301 // Skip single calls with a single stack id. These don't need a new node.
1302 if (Calls.size() == 1) {
1303 auto &Ids = std::get<1>(Calls[0]);
1304 if (Ids.size() == 1)
1305 continue;
1306 }
1307 // In order to do the best and maximal matching of inlined calls to context
1308 // node sequences we will sort the vectors of stack ids in descending order
1309 // of length, and within each length, lexicographically by stack id. The
1310 // latter is so that we can specially handle calls that have identical stack
1311 // id sequences (either due to cloning or artificially because of the MIB
1312 // context pruning).
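    // For example (hypothetical ids), the sequences [1, 2, 3], [1, 2] and
    // [1, 3] sort as [1, 2, 3], [1, 2], [1, 3]: longest first, then
    // lexicographically by stack id within the same length.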
1313 std::stable_sort(Calls.begin(), Calls.end(),
1314 [](const CallContextInfo &A, const CallContextInfo &B) {
1315 auto &IdsA = std::get<1>(A);
1316 auto &IdsB = std::get<1>(B);
1317 return IdsA.size() > IdsB.size() ||
1318 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1319 });
1320
1321 // Find the node for the last stack id, which should be the same
1322 // across all calls recorded for this id, and is the id for this
1323 // entry in the StackIdToMatchingCalls map.
1324 uint64_t LastId = It.getFirst();
    ContextNode *LastNode = getNodeForStackId(LastId);
1326 // We should only have kept stack ids that had nodes.
1327 assert(LastNode);
1328
1329 if (LastNode->Recursive)
1330 continue;
1331
1332 // Initialize the context ids with the last node's. We will subsequently
1333 // refine the context ids by computing the intersection along all edges.
1334 DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1335 assert(!LastNodeContextIds.empty());
1336
1337 for (unsigned I = 0; I < Calls.size(); I++) {
1338 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1339 assert(SavedContextIds.empty());
1340 assert(LastId == Ids.back());
1341
1342 // First compute the context ids for this stack id sequence (the
1343 // intersection of the context ids of the corresponding nodes).
1344 // Start with the remaining saved ids for the last node.
1345 assert(!LastNodeContextIds.empty());
1346 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1347
1348 ContextNode *PrevNode = LastNode;
1349 ContextNode *CurNode = LastNode;
1350 bool Skip = false;
1351
1352 // Iterate backwards through the stack Ids, starting after the last Id
1353 // in the list, which was handled once outside for all Calls.
1354 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1355 auto Id = *IdIter;
        CurNode = getNodeForStackId(Id);
1357 // We should only have kept stack ids that had nodes.
1358 assert(CurNode);
1359
1360 if (CurNode->Recursive) {
1361 Skip = true;
1362 break;
1363 }
1364
1365 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1366 // If there is no edge then the nodes belong to different MIB contexts,
1367 // and we should skip this inlined context sequence. For example, this
1368 // particular inlined context may include stack ids A->B, and we may
1369 // indeed have nodes for both A and B, but it is possible that they were
1370 // never profiled in sequence in a single MIB for any allocation (i.e.
1371 // we might have profiled an allocation that involves the callsite A,
1372 // but through a different one of its callee callsites, and we might
1373 // have profiled an allocation that involves callsite B, but reached
1374 // from a different caller callsite).
1375 if (!Edge) {
1376 Skip = true;
1377 break;
1378 }
1379 PrevNode = CurNode;
1380
1381 // Update the context ids, which is the intersection of the ids along
1382 // all edges in the sequence.
1383 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1384
        // If we now have no context ids for the clone, skip this call.
1386 if (StackSequenceContextIds.empty()) {
1387 Skip = true;
1388 break;
1389 }
1390 }
1391 if (Skip)
1392 continue;
1393
1394 // If some of this call's stack ids did not have corresponding nodes (due
1395 // to pruning), don't include any context ids for contexts that extend
1396 // beyond these nodes. Otherwise we would be matching part of unrelated /
1397 // not fully matching stack contexts. To do this, subtract any context ids
1398 // found in caller nodes of the last node found above.
1399 if (Ids.back() != getLastStackId(Call)) {
1400 for (const auto &PE : LastNode->CallerEdges) {
1401 set_subtract(StackSequenceContextIds, PE->getContextIds());
1402 if (StackSequenceContextIds.empty())
1403 break;
1404 }
        // If we now have no context ids for the clone, skip this call.
1406 if (StackSequenceContextIds.empty())
1407 continue;
1408 }
1409
1410 // Check if the next set of stack ids is the same (since the Calls vector
1411 // of tuples is sorted by the stack ids we can just look at the next one).
1412 bool DuplicateContextIds = false;
1413 if (I + 1 < Calls.size()) {
1414 auto NextIds = std::get<1>(Calls[I + 1]);
1415 DuplicateContextIds = Ids == NextIds;
1416 }
1417
1418 // If we don't have duplicate context ids, then we can assign all the
1419 // context ids computed for the original node sequence to this call.
1420 // If there are duplicate calls with the same stack ids then we synthesize
1421 // new context ids that are duplicates of the originals. These are
1422 // assigned to SavedContextIds, which is a reference into the map entry
1423 // for this call, allowing us to access these ids later on.
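      // For example, if two distinct inlined calls both map to the stack id
      // sequence {A, B} (hypothetical ids, for illustration), the first call
      // keeps the originally computed context ids, while the second is given
      // freshly duplicated ids. The old-to-new mapping is recorded in
      // OldToNewContextIds and applied to the rest of the graph by
      // propagateDuplicateContextIds below.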
      OldToNewContextIds.reserve(OldToNewContextIds.size() +
                                 StackSequenceContextIds.size());
1426 SavedContextIds =
1427 DuplicateContextIds
1428 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1429 : StackSequenceContextIds;
1430 assert(!SavedContextIds.empty());
1431
1432 if (!DuplicateContextIds) {
1433 // Update saved last node's context ids to remove those that are
1434 // assigned to other calls, so that it is ready for the next call at
1435 // this stack id.
        set_subtract(LastNodeContextIds, StackSequenceContextIds);
1437 if (LastNodeContextIds.empty())
1438 break;
1439 }
1440 }
1441 }
1442
1443 // Propagate the duplicate context ids over the graph.
1444 propagateDuplicateContextIds(OldToNewContextIds);
1445
1446 if (VerifyCCG)
1447 check();
1448
1449 // Now perform a post-order traversal over the graph, starting with the
1450 // allocation nodes, essentially processing nodes from callers to callees.
1451 // For any that contains an id in the map, update the graph to contain new
1452 // nodes representing any inlining at interior callsites. Note we move the
1453 // associated context ids over to the new nodes.
1454 DenseSet<const ContextNode *> Visited;
1455 for (auto &Entry : AllocationCallToContextNodeMap)
    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1457}
1458
1459uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1460 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
      Call->getMetadata(LLVMContext::MD_callsite));
1462 return CallsiteContext.back();
1463}
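// For illustration (assumed metadata shape, ids hypothetical): a call inlined
// through one level might carry !callsite !{i64 A, i64 B}, where A is the
// stack id of the inlined leaf frame and B the id of the frame it was inlined
// into; back() then returns B, the outermost frame of this callsite's context.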
1464
1465uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1466 assert(isa<CallsiteInfo *>(Call.getBase()));
1467 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1469 // Need to convert index into stack id.
  return Index.getStackIdAtIndex(CallsiteContext.back());
1471}
1472
1473static const std::string MemProfCloneSuffix = ".memprof.";
1474
1475static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1476 // We use CloneNo == 0 to refer to the original version, which doesn't get
1477 // renamed with a suffix.
1478 if (!CloneNo)
1479 return Base.str();
1480 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1481}
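// For example, clone 2 of a function named "foo" becomes "foo.memprof.2",
// while CloneNo == 0 leaves the name "foo" unchanged.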
1482
1483std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1484 const Instruction *Call,
1485 unsigned CloneNo) const {
1486 return (Twine(Call->getFunction()->getName()) + " -> " +
          cast<CallBase>(Call)->getCalledFunction()->getName())
1488 .str();
1489}
1490
1491std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1492 const IndexCall &Call,
1493 unsigned CloneNo) const {
  auto VI = FSToVIMap.find(Func);
1495 assert(VI != FSToVIMap.end());
  if (isa<AllocInfo *>(Call.getBase()))
1497 return (VI->second.name() + " -> alloc").str();
1498 else {
    auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
    return (VI->second.name() + " -> " +
            getMemProfFuncName(Callsite->Callee.name(),
                               Callsite->Clones[CloneNo]))
1503 .str();
1504 }
1505}
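// The resulting labels look like, e.g., "_Z3barv -> _Z3foov.memprof.1" for a
// callsite assigned to clone 1 of its callee, or "_Z3barv -> alloc" for an
// allocation node (names here are illustrative only).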
1506
1507std::vector<uint64_t>
1508ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1509 Instruction *Call) {
1510 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
      Call->getMetadata(LLVMContext::MD_callsite));
1512 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1513 CallsiteContext);
1514}
1515
1516std::vector<uint64_t>
1517IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1518 assert(isa<CallsiteInfo *>(Call.getBase()));
1519 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1521 return getStackIdsWithContextNodes<CallsiteInfo,
1522 SmallVector<unsigned>::const_iterator>(
1523 CallsiteContext);
1524}
1525
1526template <typename DerivedCCG, typename FuncTy, typename CallTy>
1527template <class NodeT, class IteratorT>
1528std::vector<uint64_t>
1529CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1530 CallStack<NodeT, IteratorT> &CallsiteContext) {
1531 std::vector<uint64_t> StackIds;
1532 for (auto IdOrIndex : CallsiteContext) {
1533 auto StackId = getStackId(IdOrIndex);
1534 ContextNode *Node = getNodeForStackId(StackId);
1535 if (!Node)
1536 break;
1537 StackIds.push_back(StackId);
1538 }
1539 return StackIds;
1540}
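// Note the walk stops at the first stack id without a context node: e.g., for
// a callsite context {A, B, C} (hypothetical ids) where only A and B got nodes
// from the MIB processing, the returned vector is {A, B}.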
1541
1542ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1543 Module &M,
1544 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1545 : Mod(M), OREGetter(OREGetter) {
1546 for (auto &F : M) {
1547 std::vector<CallInfo> CallsWithMetadata;
1548 for (auto &BB : F) {
1549 for (auto &I : BB) {
        if (!isa<CallBase>(I))
          continue;
        if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
          CallsWithMetadata.push_back(&I);
          auto *AllocNode = addAllocNode(&I, &F);
          auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
          assert(CallsiteMD);
          CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
          // Add all of the MIBs and their stack nodes.
          for (auto &MDOp : MemProfMD->operands()) {
            auto *MIBMD = cast<const MDNode>(MDOp);
            MDNode *StackNode = getMIBStackNode(MIBMD);
            assert(StackNode);
            CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
            addStackNodesForMIB<MDNode, MDNode::op_iterator>(
                AllocNode, StackContext, CallsiteContext,
                getMIBAllocType(MIBMD));
          }
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Memprof and callsite metadata on memory allocations no longer
          // needed.
          I.setMetadata(LLVMContext::MD_memprof, nullptr);
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
        }
        // For callsite metadata, add to list for this function for later use.
        else if (I.getMetadata(LLVMContext::MD_callsite))
          CallsWithMetadata.push_back(&I);
1577 }
1578 }
1579 if (!CallsWithMetadata.empty())
1580 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1581 }
1582
1583 if (DumpCCG) {
1584 dbgs() << "CCG before updating call stack chains:\n";
1585 dbgs() << *this;
1586 }
1587
1588 if (ExportToDot)
    exportToDot("prestackupdate");
1590
1591 updateStackNodes();
1592
1593 handleCallsitesWithMultipleTargets();
1594
1595 // Strip off remaining callsite metadata, no longer needed.
1596 for (auto &FuncEntry : FuncToCallsWithMetadata)
1597 for (auto &Call : FuncEntry.second)
      Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1599}
1600
1601IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1602 ModuleSummaryIndex &Index,
1603 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1604 isPrevailing)
1605 : Index(Index), isPrevailing(isPrevailing) {
1606 for (auto &I : Index) {
    auto VI = Index.getValueInfo(I);
1608 for (auto &S : VI.getSummaryList()) {
1609 // We should only add the prevailing nodes. Otherwise we may try to clone
1610 // in a weak copy that won't be linked (and may be different than the
1611 // prevailing version).
1612 // We only keep the memprof summary on the prevailing copy now when
1613 // building the combined index, as a space optimization, however don't
1614 // rely on this optimization. The linker doesn't resolve local linkage
1615 // values so don't check whether those are prevailing.
      if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1617 !isPrevailing(VI.getGUID(), S.get()))
1618 continue;
      auto *FS = dyn_cast<FunctionSummary>(S.get());
1620 if (!FS)
1621 continue;
1622 std::vector<CallInfo> CallsWithMetadata;
1623 if (!FS->allocs().empty()) {
1624 for (auto &AN : FS->mutableAllocs()) {
1625 // This can happen because of recursion elimination handling that
1626 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1627 // We still added them to the summary because we need to be able to
1628 // correlate properly in applyImport in the backends.
1629 if (AN.MIBs.empty())
1630 continue;
          CallsWithMetadata.push_back({&AN});
          auto *AllocNode = addAllocNode({&AN}, FS);
1633 // Pass an empty CallStack to the CallsiteContext (second)
1634 // parameter, since for ThinLTO we already collapsed out the inlined
1635 // stack ids on the allocation call during ModuleSummaryAnalysis.
1636 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1637 EmptyContext;
1638 // Now add all of the MIBs and their stack nodes.
1639 for (auto &MIB : AN.MIBs) {
1640 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1641 StackContext(&MIB);
1642 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                AllocNode, StackContext, EmptyContext, MIB.AllocType);
1644 }
1645 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Initialize version 0 on the summary alloc node to the current
          // alloc type, unless it has both types, in which case make it
          // default. That way, if we aren't able to clone, the original
          // version always ends up with the default allocation behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1651 }
1652 }
1653 // For callsite metadata, add to list for this function for later use.
1654 if (!FS->callsites().empty())
1655 for (auto &SN : FS->mutableCallsites())
          CallsWithMetadata.push_back({&SN});
1657
1658 if (!CallsWithMetadata.empty())
1659 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1660
1661 if (!FS->allocs().empty() || !FS->callsites().empty())
1662 FSToVIMap[FS] = VI;
1663 }
1664 }
1665
1666 if (DumpCCG) {
1667 dbgs() << "CCG before updating call stack chains:\n";
1668 dbgs() << *this;
1669 }
1670
1671 if (ExportToDot)
    exportToDot("prestackupdate");
1673
1674 updateStackNodes();
1675
1676 handleCallsitesWithMultipleTargets();
1677}
1678
1679template <typename DerivedCCG, typename FuncTy, typename CallTy>
1680void CallsiteContextGraph<DerivedCCG, FuncTy,
1681 CallTy>::handleCallsitesWithMultipleTargets() {
  // Look for and work around callsites that call multiple functions.
  // This can happen for indirect calls, which need better handling, and in
  // rarer cases (e.g. macro expansion).
1685 // TODO: To fix this for indirect calls we will want to perform speculative
1686 // devirtualization using either the normal PGO info with ICP, or using the
1687 // information in the profiled MemProf contexts. We can do this prior to
1688 // this transformation for regular LTO, and for ThinLTO we can simulate that
1689 // effect in the summary and perform the actual speculative devirtualization
1690 // while cloning in the ThinLTO backend.
1691
1692 // Keep track of the new nodes synthesized for discovered tail calls missing
1693 // from the profiled contexts.
1694 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1695
1696 for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1697 Entry != NonAllocationCallToContextNodeMap.end();) {
1698 auto *Node = Entry->second;
1699 assert(Node->Clones.empty());
1700 // Check all node callees and see if in the same function.
1701 bool Removed = false;
1702 auto Call = Node->Call.call();
1703 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1704 auto Edge = *EI;
1705 if (!Edge->Callee->hasCall()) {
1706 ++EI;
1707 continue;
1708 }
1709 assert(NodeToCallingFunc.count(Edge->Callee));
1710 // Check if the called function matches that of the callee node.
1711 if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1712 continue;
1713 RemovedEdgesWithMismatchedCallees++;
1714 // Work around by setting Node to have a null call, so it gets
1715 // skipped during cloning. Otherwise assignFunctions will assert
1716 // because its data structures are not designed to handle this case.
1717 Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1718 Node->setCall(CallInfo());
1719 Removed = true;
1720 break;
1721 }
1722 if (!Removed)
1723 Entry++;
1724 }
1725
1726 // Add the new nodes after the above loop so that the iteration is not
1727 // invalidated.
1728 for (auto &[Call, Node] : TailCallToContextNodeMap)
1729 NonAllocationCallToContextNodeMap[Call] = Node;
1730}
1731
1732uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1733 // In the Module (IR) case this is already the Id.
1734 return IdOrIndex;
1735}
1736
1737uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1738 // In the Index case this is an index into the stack id list in the summary
1739 // index, convert it to an Id.
  return Index.getStackIdAtIndex(IdOrIndex);
1741}
1742
1743template <typename DerivedCCG, typename FuncTy, typename CallTy>
1744bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1745 CallTy Call, EdgeIter &EI,
1746 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1747 auto Edge = *EI;
1748 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1749 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1750 // Will be populated in order of callee to caller if we find a chain of tail
1751 // calls between the profiled caller and callee.
1752 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1754 FoundCalleeChain)) {
1755 ++EI;
1756 return false;
1757 }
1758
1759 // The usual case where the profiled callee matches that of the IR/summary.
1760 if (FoundCalleeChain.empty()) {
1761 ++EI;
1762 return true;
1763 }
1764
1765 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1766 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1767 // If there is already an edge between these nodes, simply update it and
1768 // return.
1769 if (CurEdge) {
1770 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1771 Edge->ContextIds.end());
1772 CurEdge->AllocTypes |= Edge->AllocTypes;
1773 return;
1774 }
1775 // Otherwise, create a new edge and insert it into the caller and callee
1776 // lists.
1777 auto NewEdge = std::make_shared<ContextEdge>(
1778 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1779 Callee->CallerEdges.push_back(NewEdge);
1780 if (Caller == Edge->Caller) {
1781 // If we are inserting the new edge into the current edge's caller, insert
1782 // the new edge before the current iterator position, and then increment
1783 // back to the current edge.
1784 EI = Caller->CalleeEdges.insert(EI, NewEdge);
1785 ++EI;
1786 assert(*EI == Edge &&
1787 "Iterator position not restored after insert and increment");
1788 } else
1789 Caller->CalleeEdges.push_back(NewEdge);
1790 };
1791
1792 // Create new nodes for each found callee and connect in between the profiled
1793 // caller and callee.
1794 auto *CurCalleeNode = Edge->Callee;
1795 for (auto &[NewCall, Func] : FoundCalleeChain) {
1796 ContextNode *NewNode = nullptr;
1797 // First check if we have already synthesized a node for this tail call.
1798 if (TailCallToContextNodeMap.count(NewCall)) {
1799 NewNode = TailCallToContextNodeMap[NewCall];
1800 NewNode->ContextIds.insert(Edge->ContextIds.begin(),
1801 Edge->ContextIds.end());
1802 NewNode->AllocTypes |= Edge->AllocTypes;
1803 } else {
1804 FuncToCallsWithMetadata[Func].push_back({NewCall});
1805 // Create Node and record node info.
1806 NodeOwner.push_back(
1807 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1808 NewNode = NodeOwner.back().get();
1809 NodeToCallingFunc[NewNode] = Func;
1810 TailCallToContextNodeMap[NewCall] = NewNode;
1811 NewNode->ContextIds = Edge->ContextIds;
1812 NewNode->AllocTypes = Edge->AllocTypes;
1813 }
1814
1815 // Hook up node to its callee node
1816 AddEdge(NewNode, CurCalleeNode);
1817
1818 CurCalleeNode = NewNode;
1819 }
1820
1821 // Hook up edge's original caller to new callee node.
1822 AddEdge(Edge->Caller, CurCalleeNode);
1823
1824 // Remove old edge
1825 Edge->Callee->eraseCallerEdge(Edge.get());
1826 EI = Edge->Caller->CalleeEdges.erase(EI);
1827
1828 return true;
1829}
1830
1831bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1832 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1833 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1834 bool &FoundMultipleCalleeChains) {
1835 // Stop recursive search if we have already explored the maximum specified
1836 // depth.
1837 if (Depth > TailCallSearchDepth)
1838 return false;
1839
1840 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
    FoundCalleeChain.push_back({Callsite, F});
1842 };
1843
  auto *CalleeFunc = dyn_cast<Function>(CurCallee);
  if (!CalleeFunc) {
    auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
    assert(Alias);
    CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1849 assert(CalleeFunc);
1850 }
1851
1852 // Look for tail calls in this function, and check if they either call the
1853 // profiled callee directly, or indirectly (via a recursive search).
1854 // Only succeed if there is a single unique tail call chain found between the
1855 // profiled caller and callee, otherwise we could perform incorrect cloning.
1856 bool FoundSingleCalleeChain = false;
1857 for (auto &BB : *CalleeFunc) {
1858 for (auto &I : BB) {
      auto *CB = dyn_cast<CallBase>(&I);
1860 if (!CB || !CB->isTailCall())
1861 continue;
1862 auto *CalledValue = CB->getCalledOperand();
1863 auto *CalledFunction = CB->getCalledFunction();
1864 if (CalledValue && !CalledFunction) {
1865 CalledValue = CalledValue->stripPointerCasts();
1866 // Stripping pointer casts can reveal a called function.
        CalledFunction = dyn_cast<Function>(CalledValue);
1868 }
1869 // Check if this is an alias to a function. If so, get the
1870 // called aliasee for the checks below.
      if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
1872 assert(!CalledFunction &&
1873 "Expected null called function in callsite for alias");
        CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
1875 }
1876 if (!CalledFunction)
1877 continue;
1878 if (CalledFunction == ProfiledCallee) {
1879 if (FoundSingleCalleeChain) {
1880 FoundMultipleCalleeChains = true;
1881 return false;
1882 }
1883 FoundSingleCalleeChain = true;
1884 FoundProfiledCalleeCount++;
1885 FoundProfiledCalleeDepth += Depth;
1886 if (Depth > FoundProfiledCalleeMaxDepth)
1887 FoundProfiledCalleeMaxDepth = Depth;
1888 SaveCallsiteInfo(&I, CalleeFunc);
1889 } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CalledFunction, Depth + 1,
1891 FoundCalleeChain, FoundMultipleCalleeChains)) {
1892 if (FoundMultipleCalleeChains)
1893 return false;
1894 if (FoundSingleCalleeChain) {
1895 FoundMultipleCalleeChains = true;
1896 return false;
1897 }
1898 FoundSingleCalleeChain = true;
1899 SaveCallsiteInfo(&I, CalleeFunc);
1900 }
1901 }
1902 }
1903
1904 return FoundSingleCalleeChain;
1905}
1906
1907bool ModuleCallsiteContextGraph::calleeMatchesFunc(
1908 Instruction *Call, const Function *Func, const Function *CallerFunc,
1909 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
  auto *CB = dyn_cast<CallBase>(Call);
1911 if (!CB->getCalledOperand())
1912 return false;
1913 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
  auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1915 if (CalleeFunc == Func)
1916 return true;
  auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1918 if (Alias && Alias->getAliasee() == Func)
1919 return true;
1920
1921 // Recursively search for the profiled callee through tail calls starting with
1922 // the actual Callee. The discovered tail call chain is saved in
1923 // FoundCalleeChain, and we will fixup the graph to include these callsites
1924 // after returning.
1925 // FIXME: We will currently redo the same recursive walk if we find the same
1926 // mismatched callee from another callsite. We can improve this with more
1927 // bookkeeping of the created chain of new nodes for each mismatch.
1928 unsigned Depth = 1;
1929 bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
1931 FoundCalleeChain,
1932 FoundMultipleCalleeChains)) {
1933 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
1934 << Func->getName() << " from " << CallerFunc->getName()
1935 << " that actually called " << CalleeVal->getName()
1936 << (FoundMultipleCalleeChains
1937 ? " (found multiple possible chains)"
1938 : "")
1939 << "\n");
1940 if (FoundMultipleCalleeChains)
1941 FoundProfiledCalleeNonUniquelyCount++;
1942 return false;
1943 }
1944
1945 return true;
1946}
1947
1948bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1949 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1950 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1951 bool &FoundMultipleCalleeChains) {
1952 // Stop recursive search if we have already explored the maximum specified
1953 // depth.
1954 if (Depth > TailCallSearchDepth)
1955 return false;
1956
1957 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
1958 // Make a CallsiteInfo for each discovered callee, if one hasn't already
1959 // been synthesized.
    if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
        !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
      // StackIds is empty (we don't have debug info available in the index for
      // these callsites)
      FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
          std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
    CallsiteInfo *NewCallsiteInfo =
        FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
    FoundCalleeChain.push_back({NewCallsiteInfo, FS});
1969 };
1970
1971 // Look for tail calls in this function, and check if they either call the
1972 // profiled callee directly, or indirectly (via a recursive search).
1973 // Only succeed if there is a single unique tail call chain found between the
1974 // profiled caller and callee, otherwise we could perform incorrect cloning.
1975 bool FoundSingleCalleeChain = false;
1976 for (auto &S : CurCallee.getSummaryList()) {
    if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1978 !isPrevailing(CurCallee.getGUID(), S.get()))
1979 continue;
    auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
1981 if (!FS)
1982 continue;
1983 auto FSVI = CurCallee;
    auto *AS = dyn_cast<AliasSummary>(S.get());
1985 if (AS)
1986 FSVI = AS->getAliaseeVI();
1987 for (auto &CallEdge : FS->calls()) {
1988 if (!CallEdge.second.hasTailCall())
1989 continue;
1990 if (CallEdge.first == ProfiledCallee) {
1991 if (FoundSingleCalleeChain) {
1992 FoundMultipleCalleeChains = true;
1993 return false;
1994 }
1995 FoundSingleCalleeChain = true;
1996 FoundProfiledCalleeCount++;
1997 FoundProfiledCalleeDepth += Depth;
1998 if (Depth > FoundProfiledCalleeMaxDepth)
1999 FoundProfiledCalleeMaxDepth = Depth;
2000 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2001 // Add FS to FSToVIMap in case it isn't already there.
2002 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2003 FSToVIMap[FS] = FSVI;
2004 } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CallEdge.first, Depth + 1,
2006 FoundCalleeChain, FoundMultipleCalleeChains)) {
2007 if (FoundMultipleCalleeChains)
2008 return false;
2009 if (FoundSingleCalleeChain) {
2010 FoundMultipleCalleeChains = true;
2011 return false;
2012 }
2013 FoundSingleCalleeChain = true;
2014 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2015 // Add FS to FSToVIMap in case it isn't already there.
2016 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2017 FSToVIMap[FS] = FSVI;
2018 }
2019 }
2020 }
2021
2022 return FoundSingleCalleeChain;
2023}
2024
2025bool IndexCallsiteContextGraph::calleeMatchesFunc(
2026 IndexCall &Call, const FunctionSummary *Func,
2027 const FunctionSummary *CallerFunc,
2028 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2029 ValueInfo Callee =
      dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2031 // If there is no summary list then this is a call to an externally defined
2032 // symbol.
2033 AliasSummary *Alias =
2034 Callee.getSummaryList().empty()
2035 ? nullptr
          : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2037 assert(FSToVIMap.count(Func));
2038 auto FuncVI = FSToVIMap[Func];
2039 if (Callee == FuncVI ||
2040 // If callee is an alias, check the aliasee, since only function
2041 // summary base objects will contain the stack node summaries and thus
2042 // get a context node.
2043 (Alias && Alias->getAliaseeVI() == FuncVI))
2044 return true;
2045
2046 // Recursively search for the profiled callee through tail calls starting with
2047 // the actual Callee. The discovered tail call chain is saved in
2048 // FoundCalleeChain, and we will fixup the graph to include these callsites
2049 // after returning.
2050 // FIXME: We will currently redo the same recursive walk if we find the same
2051 // mismatched callee from another callsite. We can improve this with more
2052 // bookkeeping of the created chain of new nodes for each mismatch.
2053 unsigned Depth = 1;
2054 bool FoundMultipleCalleeChains = false;
2055 if (!findProfiledCalleeThroughTailCalls(
          FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2057 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2058 << " from " << FSToVIMap[CallerFunc]
2059 << " that actually called " << Callee
2060 << (FoundMultipleCalleeChains
2061 ? " (found multiple possible chains)"
2062 : "")
2063 << "\n");
2064 if (FoundMultipleCalleeChains)
2065 FoundProfiledCalleeNonUniquelyCount++;
2066 return false;
2067 }
2068
2069 return true;
2070}
2071
2072static std::string getAllocTypeString(uint8_t AllocTypes) {
2073 if (!AllocTypes)
2074 return "None";
2075 std::string Str;
2076 if (AllocTypes & (uint8_t)AllocationType::NotCold)
2077 Str += "NotCold";
2078 if (AllocTypes & (uint8_t)AllocationType::Cold)
2079 Str += "Cold";
2080 return Str;
2081}
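// For example, a value with only the NotCold bit set yields "NotCold", and a
// value with both bits set yields the combined string "NotColdCold".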
2082
2083template <typename DerivedCCG, typename FuncTy, typename CallTy>
2084void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2085 const {
  print(dbgs());
2087 dbgs() << "\n";
2088}
2089
2090template <typename DerivedCCG, typename FuncTy, typename CallTy>
2091void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2092 raw_ostream &OS) const {
2093 OS << "Node " << this << "\n";
2094 OS << "\t";
2095 printCall(OS);
2096 if (Recursive)
2097 OS << " (recursive)";
2098 OS << "\n";
2099 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2100 OS << "\tContextIds:";
2101 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(SortedIds.begin(), SortedIds.end());
2103 for (auto Id : SortedIds)
2104 OS << " " << Id;
2105 OS << "\n";
2106 OS << "\tCalleeEdges:\n";
2107 for (auto &Edge : CalleeEdges)
2108 OS << "\t\t" << *Edge << "\n";
2109 OS << "\tCallerEdges:\n";
2110 for (auto &Edge : CallerEdges)
2111 OS << "\t\t" << *Edge << "\n";
2112 if (!Clones.empty()) {
2113 OS << "\tClones: ";
2114 FieldSeparator FS;
2115 for (auto *Clone : Clones)
2116 OS << FS << Clone;
2117 OS << "\n";
2118 } else if (CloneOf) {
2119 OS << "\tClone of " << CloneOf << "\n";
2120 }
2121}
2122
2123template <typename DerivedCCG, typename FuncTy, typename CallTy>
2124void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2125 const {
  print(dbgs());
2127 dbgs() << "\n";
2128}
2129
2130template <typename DerivedCCG, typename FuncTy, typename CallTy>
2131void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2132 raw_ostream &OS) const {
2133 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2134 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2135 OS << " ContextIds:";
2136 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(SortedIds.begin(), SortedIds.end());
2138 for (auto Id : SortedIds)
2139 OS << " " << Id;
2140}
2141
2142template <typename DerivedCCG, typename FuncTy, typename CallTy>
2143void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
  print(dbgs());
2145}
2146
2147template <typename DerivedCCG, typename FuncTy, typename CallTy>
2148void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2149 raw_ostream &OS) const {
2150 OS << "Callsite Context Graph:\n";
2151 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2152 for (const auto Node : nodes<GraphType>(this)) {
2153 if (Node->isRemoved())
2154 continue;
2155 Node->print(OS);
2156 OS << "\n";
2157 }
2158}
2159
2160template <typename DerivedCCG, typename FuncTy, typename CallTy>
2161static void checkEdge(
2162 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
2163 // Confirm that alloc type is not None and that we have at least one context
2164 // id.
2165 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
2166 assert(!Edge->ContextIds.empty());
2167}
2168
2169template <typename DerivedCCG, typename FuncTy, typename CallTy>
2170static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
2171 bool CheckEdges = true) {
2172 if (Node->isRemoved())
2173 return;
2174 // Node's context ids should be the union of both its callee and caller edge
2175 // context ids.
2176 if (Node->CallerEdges.size()) {
2177 auto EI = Node->CallerEdges.begin();
2178 auto &FirstEdge = *EI;
2179 EI++;
2180 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
2181 for (; EI != Node->CallerEdges.end(); EI++) {
2182 const auto &Edge = *EI;
2183 if (CheckEdges)
2184 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2185 set_union(CallerEdgeContextIds, Edge->ContextIds);
2186 }
2187 // Node can have more context ids than callers if some contexts terminate at
2188 // node and some are longer.
2189 assert(Node->ContextIds == CallerEdgeContextIds ||
2190 set_is_subset(CallerEdgeContextIds, Node->ContextIds));
2191 }
2192 if (Node->CalleeEdges.size()) {
2193 auto EI = Node->CalleeEdges.begin();
2194 auto &FirstEdge = *EI;
2195 EI++;
2196 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
2197 for (; EI != Node->CalleeEdges.end(); EI++) {
2198 const auto &Edge = *EI;
2199 if (CheckEdges)
2200 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2201 set_union(CalleeEdgeContextIds, Edge->ContextIds);
2202 }
2203 assert(Node->ContextIds == CalleeEdgeContextIds);
2204 }
2205}
2206
2207template <typename DerivedCCG, typename FuncTy, typename CallTy>
2208void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2209 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2210 for (const auto Node : nodes<GraphType>(this)) {
2211 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2212 for (auto &Edge : Node->CallerEdges)
2213 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2214 }
2215}
2216
2217template <typename DerivedCCG, typename FuncTy, typename CallTy>
2218struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2219 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2220 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2221
2222 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2223 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2224
2225 using nodes_iterator =
2226 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2227 decltype(&getNode)>;
2228
2229 static nodes_iterator nodes_begin(GraphType G) {
2230 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2231 }
2232
2233 static nodes_iterator nodes_end(GraphType G) {
2234 return nodes_iterator(G->NodeOwner.end(), &getNode);
2235 }
2236
2237 static NodeRef getEntryNode(GraphType G) {
2238 return G->NodeOwner.begin()->get();
2239 }
2240
2241 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2242 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2243 GetCallee(const EdgePtrTy &P) {
2244 return P->Callee;
2245 }
2246
2247 using ChildIteratorType =
2248 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2249 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2250 decltype(&GetCallee)>;
2251
2252 static ChildIteratorType child_begin(NodeRef N) {
2253 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2254 }
2255
2256 static ChildIteratorType child_end(NodeRef N) {
2257 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2258 }
2259};
2260
2261template <typename DerivedCCG, typename FuncTy, typename CallTy>
2262struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2263 : public DefaultDOTGraphTraits {
2264 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2265
2266 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2267 using GTraits = GraphTraits<GraphType>;
2268 using NodeRef = typename GTraits::NodeRef;
2269 using ChildIteratorType = typename GTraits::ChildIteratorType;
2270
2271 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2272 std::string LabelString =
2273 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2274 Twine(Node->OrigStackOrAllocId))
2275 .str();
2276 LabelString += "\n";
2277 if (Node->hasCall()) {
2278 auto Func = G->NodeToCallingFunc.find(Node);
2279 assert(Func != G->NodeToCallingFunc.end());
2280 LabelString +=
2281 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2282 } else {
2283 LabelString += "null call";
2284 if (Node->Recursive)
2285 LabelString += " (recursive)";
2286 else
2287 LabelString += " (external)";
2288 }
2289 return LabelString;
2290 }
2291
2292 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2293 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
                                   getContextIds(Node->ContextIds) + "\"")
2295 .str();
2296 AttributeString +=
        (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2298 AttributeString += ",style=\"filled\"";
2299 if (Node->CloneOf) {
2300 AttributeString += ",color=\"blue\"";
2301 AttributeString += ",style=\"filled,bold,dashed\"";
2302 } else
2303 AttributeString += ",style=\"filled\"";
2304 return AttributeString;
2305 }
2306
2307 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2308 GraphType) {
2309 auto &Edge = *(ChildIter.getCurrent());
    return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
            Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2312 .str();
2313 }
2314
  // Since the NodeOwner list includes nodes that are no longer connected to
  // the graph, skip them here.
2317 static bool isNodeHidden(NodeRef Node, GraphType) {
2318 return Node->isRemoved();
2319 }
2320
2321private:
2322 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2323 std::string IdString = "ContextIds:";
2324 if (ContextIds.size() < 100) {
2325 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
      std::sort(SortedIds.begin(), SortedIds.end());
2327 for (auto Id : SortedIds)
2328 IdString += (" " + Twine(Id)).str();
2329 } else {
2330 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2331 }
2332 return IdString;
2333 }
2334
2335 static std::string getColor(uint8_t AllocTypes) {
2336 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2337 // Color "brown1" actually looks like a lighter red.
2338 return "brown1";
2339 if (AllocTypes == (uint8_t)AllocationType::Cold)
2340 return "cyan";
2341 if (AllocTypes ==
2342 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2343 // Lighter purple.
2344 return "mediumorchid1";
2345 return "gray";
2346 }
2347
2348 static std::string getNodeId(NodeRef Node) {
2349 std::stringstream SStream;
2350 SStream << std::hex << "N0x" << (unsigned long long)Node;
2351 std::string Result = SStream.str();
2352 return Result;
2353 }
2354};
2355
2356template <typename DerivedCCG, typename FuncTy, typename CallTy>
2357void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2358 std::string Label) const {
2359 WriteGraph(this, "", false, Label,
2360 DotFilePathPrefix + "ccg." + Label + ".dot");
2361}
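// For example, with Label "prestackupdate" this writes the graph to
// "<DotFilePathPrefix>ccg.prestackupdate.dot".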
2362
2363template <typename DerivedCCG, typename FuncTy, typename CallTy>
2364typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2365CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2366 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
2367 DenseSet<uint32_t> ContextIdsToMove) {
2368 ContextNode *Node = Edge->Callee;
2369 NodeOwner.push_back(
2370 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2371 ContextNode *Clone = NodeOwner.back().get();
2372 Node->addClone(Clone);
2373 assert(NodeToCallingFunc.count(Node));
2374 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
  moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2376 ContextIdsToMove);
2377 return Clone;
2378}
2379
2380template <typename DerivedCCG, typename FuncTy, typename CallTy>
2381void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2382 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2383 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2384 bool NewClone,
2385 DenseSet<uint32_t> ContextIdsToMove) {
2386 // NewCallee and Edge's current callee must be clones of the same original
2387 // node (Edge's current callee may be the original node too).
2388 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2389
2390 ContextNode *OldCallee = Edge->Callee;
2391
2392 // We might already have an edge to the new callee from earlier cloning for a
2393 // different allocation. If one exists we will reuse it.
2394 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
2395
2396 // Callers will pass an empty ContextIdsToMove set when they want to move the
2397 // edge. Copy in Edge's ids for simplicity.
2398 if (ContextIdsToMove.empty())
2399 ContextIdsToMove = Edge->getContextIds();
2400
2401 // If we are moving all of Edge's ids, then just move the whole Edge.
2402 // Otherwise only move the specified subset, to a new edge if needed.
2403 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
2404 // Moving the whole Edge.
2405 if (CallerEdgeI)
2406 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2407 else
2408 OldCallee->eraseCallerEdge(Edge.get());
2409 if (ExistingEdgeToNewCallee) {
2410 // Since we already have an edge to NewCallee, simply move the ids
2411 // onto it, and remove the existing Edge.
2412 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2413 ContextIdsToMove.end());
2414 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
2415 assert(Edge->ContextIds == ContextIdsToMove);
2416 Edge->ContextIds.clear();
2417 Edge->AllocTypes = (uint8_t)AllocationType::None;
2418 Edge->Caller->eraseCalleeEdge(Edge.get());
2419 } else {
2420 // Otherwise just reconnect Edge to NewCallee.
2421 Edge->Callee = NewCallee;
2422 NewCallee->CallerEdges.push_back(Edge);
2423 // Don't need to update Edge's context ids since we are simply
2424 // reconnecting it.
2425 }
    // In either case, need to update the alloc types on NewCallee.
2427 NewCallee->AllocTypes |= Edge->AllocTypes;
2428 } else {
2429 // Only moving a subset of Edge's ids.
2430 if (CallerEdgeI)
2431 ++CallerEdgeI;
2432 // Compute the alloc type of the subset of ids being moved.
    auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2434 if (ExistingEdgeToNewCallee) {
2435 // Since we already have an edge to NewCallee, simply move the ids
2436 // onto it.
2437 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2438 ContextIdsToMove.end());
2439 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
2440 } else {
2441 // Otherwise, create a new edge to NewCallee for the ids being moved.
2442 auto NewEdge = std::make_shared<ContextEdge>(
2443 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
2444 Edge->Caller->CalleeEdges.push_back(NewEdge);
2445 NewCallee->CallerEdges.push_back(NewEdge);
2446 }
2447 // In either case, need to update the alloc types on NewCallee, and remove
2448 // those ids and update the alloc type on the original Edge.
2449 NewCallee->AllocTypes |= CallerEdgeAllocType;
2450 set_subtract(Edge->ContextIds, ContextIdsToMove);
    Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2452 }
2453 // Now perform some updates that are common to all cases: the NewCallee gets
2454 // the moved ids added, and we need to remove those ids from OldCallee and
2455 // update its alloc type (NewCallee alloc type updates handled above).
2456 NewCallee->ContextIds.insert(ContextIdsToMove.begin(),
2457 ContextIdsToMove.end());
2458 set_subtract(OldCallee->ContextIds, ContextIdsToMove);
  OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2460 // OldCallee alloc type should be None iff its context id set is now empty.
2461 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2462 OldCallee->ContextIds.empty());
2463 // Now walk the old callee node's callee edges and move Edge's context ids
2464 // over to the corresponding edge into the clone (which is created here if
2465 // this is a newly created clone).
2466 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2467 // The context ids moving to the new callee are the subset of this edge's
2468 // context ids and the context ids on the caller edge being moved.
2469 DenseSet<uint32_t> EdgeContextIdsToMove =
2470 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
2471 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2472 OldCalleeEdge->AllocTypes =
        computeAllocType(OldCalleeEdge->getContextIds());
2474 if (!NewClone) {
2475 // Update context ids / alloc type on corresponding edge to NewCallee.
2476 // There is a chance this may not exist if we are reusing an existing
2477 // clone, specifically during function assignment, where we would have
2478 // removed none type edges after creating the clone. If we can't find
2479 // a corresponding edge there, fall through to the cloning below.
2480 if (auto *NewCalleeEdge =
2481 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2482 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2483 EdgeContextIdsToMove.end());
        NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2485 continue;
2486 }
2487 }
2488 auto NewEdge = std::make_shared<ContextEdge>(
2489 OldCalleeEdge->Callee, NewCallee,
        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2491 NewCallee->CalleeEdges.push_back(NewEdge);
2492 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2493 }
2494 if (VerifyCCG) {
2495 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2496 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2497 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2498 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2499 /*CheckEdges=*/false);
2500 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2501 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2502 /*CheckEdges=*/false);
2503 }
2504}
2505
2506template <typename DerivedCCG, typename FuncTy, typename CallTy>
2507void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2508 recursivelyRemoveNoneTypeCalleeEdges(
2509 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2510 auto Inserted = Visited.insert(Node);
2511 if (!Inserted.second)
2512 return;
2513
2514 removeNoneTypeCalleeEdges(Node);
2515
2516 for (auto *Clone : Node->Clones)
    recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2518
2519 // The recursive call may remove some of this Node's caller edges.
2520 // Iterate over a copy and skip any that were removed.
2521 auto CallerEdges = Node->CallerEdges;
2522 for (auto &Edge : CallerEdges) {
2523 // Skip any that have been removed by an earlier recursive call.
2524 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2525 assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2526 Edge));
2527 continue;
2528 }
    recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2530 }
2531}
2532
2533template <typename DerivedCCG, typename FuncTy, typename CallTy>
2534void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2535 DenseSet<const ContextNode *> Visited;
2536 for (auto &Entry : AllocationCallToContextNodeMap) {
2537 Visited.clear();
2538 identifyClones(Entry.second, Visited, Entry.second->ContextIds);
2539 }
2540 Visited.clear();
2541 for (auto &Entry : AllocationCallToContextNodeMap)
    recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2543 if (VerifyCCG)
2544 check();
2545}
2546
// Helper function to check whether an AllocType is cold, notcold, or both.
2548bool checkColdOrNotCold(uint8_t AllocType) {
2549 return (AllocType == (uint8_t)AllocationType::Cold) ||
2550 (AllocType == (uint8_t)AllocationType::NotCold) ||
2551 (AllocType ==
2552 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2553}
2554
2555template <typename DerivedCCG, typename FuncTy, typename CallTy>
2556void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2557 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2558 const DenseSet<uint32_t> &AllocContextIds) {
2559 if (VerifyNodes)
2560 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2561 assert(!Node->CloneOf);
2562
  // If Node has a null call, then either it wasn't found in the module (regular
2564 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2565 // cloning (e.g. recursion, calls multiple targets, etc).
2566 // Do this here so that we don't try to recursively clone callers below, which
2567 // isn't useful at least for this node.
2568 if (!Node->hasCall())
2569 return;
2570
2571#ifndef NDEBUG
2572 auto Insert =
2573#endif
2574 Visited.insert(Node);
2575 // We should not have visited this node yet.
2576 assert(Insert.second);
2577 // The recursive call to identifyClones may delete the current edge from the
2578 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2579 // in an iterator and having recursive call erase from it. Other edges may
2580 // also get removed during the recursion, which will have null Callee and
2581 // Caller pointers (and are deleted later), so we skip those below.
2582 {
2583 auto CallerEdges = Node->CallerEdges;
2584 for (auto &Edge : CallerEdges) {
2585 // Skip any that have been removed by an earlier recursive call.
2586 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2587 assert(!llvm::count(Node->CallerEdges, Edge));
2588 continue;
2589 }
2590 // Ignore any caller we previously visited via another edge.
2591 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2592 identifyClones(Edge->Caller, Visited, AllocContextIds);
2593 }
2594 }
2595 }
2596
  // Check if we reached an unambiguous call or have only a single caller.
2598 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2599 return;
2600
2601 // We need to clone.
2602
2603 // Try to keep the original version as alloc type NotCold. This will make
2604 // cases with indirect calls or any other situation with an unknown call to
2605 // the original function get the default behavior. We do this by sorting the
2606 // CallerEdges of the Node we will clone by alloc type.
2607 //
2608 // Give NotCold edge the lowest sort priority so those edges are at the end of
2609 // the caller edges vector, and stay on the original version (since the below
2610 // code clones greedily until it finds all remaining edges have the same type
2611 // and leaves the remaining ones on the original Node).
2612 //
2613 // We shouldn't actually have any None type edges, so the sorting priority for
2614 // that is arbitrary, and we assert in that case below.
2615 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2616 /*Cold*/ 1,
2617 /*NotColdCold*/ 2};
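  // With these priorities the caller edges end up ordered, e.g., as
  //   Cold, NotColdCold, (None), NotCold
  // so cold-only callers are peeled off onto clones first and the NotCold
  // callers remain on the original Node at the end of the vector.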
2618 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2619 [&](const std::shared_ptr<ContextEdge> &A,
2620 const std::shared_ptr<ContextEdge> &B) {
2621 // Nodes with non-empty context ids should be sorted before
2622 // those with empty context ids.
2623 if (A->ContextIds.empty())
2624 // Either B ContextIds are non-empty (in which case we
2625 // should return false because B < A), or B ContextIds
2626 // are empty, in which case they are equal, and we should
2627 // maintain the original relative ordering.
2628 return false;
2629 if (B->ContextIds.empty())
2630 return true;
2631
2632 if (A->AllocTypes == B->AllocTypes)
2633 // Use the first context id for each edge as a
2634 // tie-breaker.
2635 return *A->ContextIds.begin() < *B->ContextIds.begin();
2636 return AllocTypeCloningPriority[A->AllocTypes] <
2637 AllocTypeCloningPriority[B->AllocTypes];
2638 });
2639
2640 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2641
2642 // Iterate until we find no more opportunities for disambiguating the alloc
2643 // types via cloning. In most cases this loop will terminate once the Node
2644 // has a single allocation type, in which case no more cloning is needed.
2645 // We need to be able to remove Edge from CallerEdges, so need to adjust
2646 // iterator inside the loop.
2647 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2648 auto CallerEdge = *EI;
2649
2650 // See if cloning the prior caller edge left this node with a single alloc
2651 // type or a single caller. In that case no more cloning of Node is needed.
2652 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2653 break;
2654
2655 // Only need to process the ids along this edge pertaining to the given
2656 // allocation.
2657 auto CallerEdgeContextsForAlloc =
2658 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
2659 if (CallerEdgeContextsForAlloc.empty()) {
2660 ++EI;
2661 continue;
2662 }
    auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2664
2665 // Compute the node callee edge alloc types corresponding to the context ids
2666 // for this caller edge.
2667 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
    CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
    for (auto &CalleeEdge : Node->CalleeEdges)
      CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
          CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2672
2673 // Don't clone if doing so will not disambiguate any alloc types amongst
2674 // caller edges (including the callee edges that would be cloned).
2675 // Otherwise we will simply move all edges to the clone.
2676 //
2677 // First check if by cloning we will disambiguate the caller allocation
2678 // type from node's allocation type. Query allocTypeToUse so that we don't
2679 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2680 // neither of these should be None type.
2681 //
2682 // Then check if by cloning node at least one of the callee edges will be
2683 // disambiguated by splitting out different context ids.
2684 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2685 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2686 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
2687 allocTypeToUse(Node->AllocTypes) &&
2688 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2689 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2690 ++EI;
2691 continue;
2692 }
2693
2694 // First see if we can use an existing clone. Check each clone and its
2695 // callee edges for matching alloc types.
2696 ContextNode *Clone = nullptr;
2697 for (auto *CurClone : Node->Clones) {
2698 if (allocTypeToUse(CurClone->AllocTypes) !=
2699 allocTypeToUse(CallerAllocTypeForAlloc))
2700 continue;
2701
2702 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2703 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2704 continue;
2705 Clone = CurClone;
2706 break;
2707 }
2708
2709 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2710 if (Clone)
      moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
                                    CallerEdgeContextsForAlloc);
    else
      Clone =
          moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2716
2717 assert(EI == Node->CallerEdges.end() ||
2718 Node->AllocTypes != (uint8_t)AllocationType::None);
2719 // Sanity check that no alloc types on clone or its edges are None.
2720 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2721 }
2722
2723 // We should still have some context ids on the original Node.
2724 assert(!Node->ContextIds.empty());
2725
2726 // Sanity check that no alloc types on node or edges are None.
2727 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2728
2729 if (VerifyNodes)
2730 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2731}
2732
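 // Illustrative note: in the IR (regular LTO) case below, the decision is
 // applied by attaching a "memprof" string attribute to the allocation call.
 // In textual IR this is printed via an attribute group, conceptually:
 //   %p = call ptr @_Znwm(i64 8) #1
 //   attributes #1 = { "memprof"="cold" }
 // A subsequent transformation (e.g. hot/cold operator new support) can then
 // route the call to a hinted allocator.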
2733void ModuleCallsiteContextGraph::updateAllocationCall(
2734 CallInfo &Call, AllocationType AllocType) {
2735 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2736 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2737 "memprof", AllocTypeString);
2738 cast<CallBase>(Call.call())->addFnAttr(A);
2739 OREGetter(Call.call()->getFunction())
2740 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2741 << ore::NV("AllocationCall", Call.call()) << " in clone "
2742 << ore::NV("Caller", Call.call()->getFunction())
2743 << " marked with memprof allocation attribute "
2744 << ore::NV("Attribute", AllocTypeString));
2745}
2746
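 // In the index (ThinLTO) case the decision is simply recorded in the summary:
 // the Versions entry for this clone is set to the chosen allocation type, and
 // the IR is updated later in the ThinLTO backend (see applyImport).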
2747void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2748 AllocationType AllocType) {
2749 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2750 assert(AI);
2751 assert(AI->Versions.size() > Call.cloneNo());
2752 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2753}
2754
2755void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2756 FuncInfo CalleeFunc) {
2757 if (CalleeFunc.cloneNo() > 0)
2758 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2759 OREGetter(CallerCall.call()->getFunction())
2760 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2761 << ore::NV("Call", CallerCall.call()) << " in clone "
2762 << ore::NV("Caller", CallerCall.call()->getFunction())
2763 << " assigned to call function clone "
2764 << ore::NV("Callee", CalleeFunc.func()));
2765}
2766
2767void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2768 FuncInfo CalleeFunc) {
2769 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2770 assert(CI &&
2771 "Caller cannot be an allocation which should not have profiled calls");
2772 assert(CI->Clones.size() > CallerCall.cloneNo());
2773 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2774}
2775
2776CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2777 Instruction *>::FuncInfo
2778ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2779 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2780 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2781 // Use existing LLVM facilities for cloning and obtaining Call in clone
2782 ValueToValueMapTy VMap;
2783 auto *NewFunc = CloneFunction(Func.func(), VMap);
2784 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2785 assert(!Func.func()->getParent()->getFunction(Name));
2786 NewFunc->setName(Name);
2787 for (auto &Inst : CallsWithMetadataInFunc) {
2788 // This map always has the initial version in it.
2789 assert(Inst.cloneNo() == 0);
2790 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2791 }
2792 OREGetter(Func.func())
2793 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2794 << "created clone " << ore::NV("NewFunction", NewFunc));
2795 return {NewFunc, CloneNo};
2796}
2797
2798CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2799 IndexCall>::FuncInfo
2800IndexCallsiteContextGraph::cloneFunctionForCallsite(
2801 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2802 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2803 // Check how many clones we have of Call (and therefore function).
2804 // The next clone number is the current size of versions array.
2805 // Confirm this matches the CloneNo provided by the caller, which is based on
2806 // the number of function clones we have.
2807 assert(CloneNo ==
2808 (Call.call().is<AllocInfo *>()
2809 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2810 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2811 // Walk all the instructions in this function. Create a new version for
2812 // each (by adding an entry to the Versions/Clones summary array), and copy
2813 // over the version being called for the function clone being cloned here.
2814 // Additionally, add an entry to the CallMap for the new function clone,
2815 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2816 // to the new call clone.
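 // For example (illustrative), when creating clone number 2 while an AllocInfo
 // currently has Versions == [NotCold, Cold], a third entry initialized to
 // None is appended; updateAllocationCall later fills in the clone's type.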
2817 for (auto &Inst : CallsWithMetadataInFunc) {
2818 // This map always has the initial version in it.
2819 assert(Inst.cloneNo() == 0);
2820 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2821 assert(AI->Versions.size() == CloneNo);
2822 // We assign the allocation type later (in updateAllocationCall), just add
2823 // an entry for it here.
2824 AI->Versions.push_back(0);
2825 } else {
2826 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2827 assert(CI && CI->Clones.size() == CloneNo);
2828 // We assign the clone number later (in updateCall), just add an entry for
2829 // it here.
2830 CI->Clones.push_back(0);
2831 }
2832 CallMap[Inst] = {Inst.call(), CloneNo};
2833 }
2834 return {Func.func(), CloneNo};
2835}
2836
2837// This method assigns cloned callsites to functions, cloning the functions as
2838// needed. The assignment is greedy and proceeds roughly as follows:
2839//
2840// For each function Func:
2841// For each call with graph Node having clones:
2842// Initialize ClonesWorklist to Node and its clones
2843// Initialize NodeCloneCount to 0
2844// While ClonesWorklist is not empty:
2845// Clone = pop front ClonesWorklist
2846// NodeCloneCount++
2847// If Func has been cloned less than NodeCloneCount times:
2848// If NodeCloneCount is 1:
2849// Assign Clone to original Func
2850// Continue
2851// Create a new function clone
2852// If other callers not assigned to call a function clone yet:
2853// Assign them to call new function clone
2854// Continue
2855// Assign any other caller calling the cloned version to new clone
2856//
2857// For each caller of Clone:
2858// If caller is assigned to call a specific function clone:
2859// If we cannot assign Clone to that function clone:
2860// Create new callsite Clone NewClone
2861// Add NewClone to ClonesWorklist
2862// Continue
2863// Assign Clone to existing caller's called function clone
2864// Else:
2865// If Clone not already assigned to a function clone:
2866// Assign to first function clone without assignment
2867// Assign caller to selected function clone
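// Illustrative walk-through: suppose Func contains callsite node N with a
// single clone N'. N is popped first; since no function clones exist yet, it
// is assigned to the original Func (clone 0) and its callers are recorded as
// calling Func. When N' is popped, only one copy of Func exists, so Func is
// cloned (creating clones of all calls with metadata in it), N' is assigned
// to the new function clone, and the callers of N' are recorded as calling
// it. Subsequent callsites in Func then reuse these function clones based on
// which clone their callers were already assigned to call.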
2868template <typename DerivedCCG, typename FuncTy, typename CallTy>
2869bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2870 bool Changed = false;
2871
2872 // Keep track of the assignment of nodes (callsites) to function clones they
2873 // call.
2874 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2875
2876 // Update caller node to call function version CalleeFunc, by recording the
2877 // assignment in CallsiteToCalleeFuncCloneMap.
2878 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2879 const FuncInfo &CalleeFunc) {
2880 assert(Caller->hasCall());
2881 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2882 };
2883
2884 // Walk all functions for which we saw calls with memprof metadata, and
2885 // handle cloning for each of their calls.
2886 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2887 FuncInfo OrigFunc(Func);
2888 // Map from each clone of OrigFunc to a map of remappings of each call of
2889 // interest (from original uncloned call to the corresponding cloned call in
2890 // that function clone).
2891 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2892 for (auto &Call : CallsWithMetadata) {
2893 ContextNode *Node = getNodeForInst(Call);
2894 // Skip call if we do not have a node for it (all uses of its stack ids
2895 // were either on inlined chains or pruned from the MIBs), or if we did
2896 // not create any clones for it.
2897 if (!Node || Node->Clones.empty())
2898 continue;
2899 assert(Node->hasCall() &&
2900 "Not having a call should have prevented cloning");
2901
2902 // Track the assignment of function clones to clones of the current
2903 // callsite Node being handled.
2904 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2905
2906 // Assign callsite version CallsiteClone to function version FuncClone,
2907 // and also assign (possibly cloned) Call to CallsiteClone.
2908 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2909 CallInfo &Call,
2910 ContextNode *CallsiteClone,
2911 bool IsAlloc) {
2912 // Record the clone of callsite node assigned to this function clone.
2913 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2914
2915 assert(FuncClonesToCallMap.count(FuncClone));
2916 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2917 CallInfo CallClone(Call);
2918 if (CallMap.count(Call))
2919 CallClone = CallMap[Call];
2920 CallsiteClone->setCall(CallClone);
2921 };
2922
2923 // Keep track of the clones of callsite Node that need to be assigned to
2924 // function clones. This list may be expanded in the loop body below if we
2925 // find additional cloning is required.
2926 std::deque<ContextNode *> ClonesWorklist;
2927 // Ignore original Node if we moved all of its contexts to clones.
2928 if (!Node->ContextIds.empty())
2929 ClonesWorklist.push_back(Node);
2930 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2931 Node->Clones.end());
2932
2933 // Now walk through all of the clones of this callsite Node that we need,
2934 // and determine the assignment to a corresponding clone of the current
2935 // function (creating new function clones as needed).
2936 unsigned NodeCloneCount = 0;
2937 while (!ClonesWorklist.empty()) {
2938 ContextNode *Clone = ClonesWorklist.front();
2939 ClonesWorklist.pop_front();
2940 NodeCloneCount++;
2941 if (VerifyNodes)
2942 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2943
2944 // Need to create a new function clone if we have more callsite clones
2945 // than existing function clones, which would have been assigned to an
2946 // earlier clone in the list (we assign callsite clones to function
2947 // clones greedily).
2948 if (FuncClonesToCallMap.size() < NodeCloneCount) {
2949 // If this is the first callsite copy, assign to original function.
2950 if (NodeCloneCount == 1) {
2951 // Since FuncClonesToCallMap is empty in this case, no clones have
2952 // been created for this function yet, and no callers should have
2953 // been assigned a function clone for this callee node yet.
2954 assert(llvm::none_of(
2955 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2956 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2957 }));
2958 // Initialize with empty call map, assign Clone to original function
2959 // and its callers, and skip to the next clone.
2960 FuncClonesToCallMap[OrigFunc] = {};
2961 AssignCallsiteCloneToFuncClone(
2962 OrigFunc, Call, Clone,
2963 AllocationCallToContextNodeMap.count(Call));
2964 for (auto &CE : Clone->CallerEdges) {
2965 // Ignore any caller that does not have a recorded callsite Call.
2966 if (!CE->Caller->hasCall())
2967 continue;
2968 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2969 }
2970 continue;
2971 }
2972
2973 // First locate which copy of OrigFunc to clone again. If a caller
2974 // of this callsite clone was already assigned to call a particular
2975 // function clone, we need to redirect all of those callers to the
2976 // new function clone, and update their other callees within this
2977 // function.
2978 FuncInfo PreviousAssignedFuncClone;
2979 auto EI = llvm::find_if(
2980 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2981 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2982 });
2983 bool CallerAssignedToCloneOfFunc = false;
2984 if (EI != Clone->CallerEdges.end()) {
2985 const std::shared_ptr<ContextEdge> &Edge = *EI;
2986 PreviousAssignedFuncClone =
2987 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2988 CallerAssignedToCloneOfFunc = true;
2989 }
2990
2991 // Clone function and save it along with the CallInfo map created
2992 // during cloning in the FuncClonesToCallMap.
2993 std::map<CallInfo, CallInfo> NewCallMap;
2994 unsigned CloneNo = FuncClonesToCallMap.size();
2995 assert(CloneNo > 0 && "Clone 0 is the original function, which "
2996 "should already exist in the map");
2997 FuncInfo NewFuncClone = cloneFunctionForCallsite(
2998 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2999 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3000 FunctionClonesAnalysis++;
3001 Changed = true;
3002
3003 // If no caller callsites were already assigned to a clone of this
3004 // function, we can simply assign this clone to the new func clone
3005 // and update all callers to it, then skip to the next clone.
3006 if (!CallerAssignedToCloneOfFunc) {
3007 AssignCallsiteCloneToFuncClone(
3008 NewFuncClone, Call, Clone,
3009 AllocationCallToContextNodeMap.count(Call));
3010 for (auto &CE : Clone->CallerEdges) {
3011 // Ignore any caller that does not have a recorded callsite Call.
3012 if (!CE->Caller->hasCall())
3013 continue;
3014 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3015 }
3016 continue;
3017 }
3018
3019 // We may need to do additional node cloning in this case.
3020 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3021 // that were previously assigned to call PreviousAssignedFuncClone,
3022 // to record that they now call NewFuncClone.
3023 for (auto CE : Clone->CallerEdges) {
3024 // Skip any that have been removed on an earlier iteration.
3025 if (!CE)
3026 continue;
3027 // Ignore any caller that does not have a recorded callsite Call.
3028 if (!CE->Caller->hasCall())
3029 continue;
3030
3031 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3032 // We subsequently fall through to later handling that
3033 // will perform any additional cloning required for
3034 // callers that were calling other function clones.
3035 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3036 PreviousAssignedFuncClone)
3037 continue;
3038
3039 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3040
3041 // If we are cloning a function that was already assigned to some
3042 // callers, then essentially we are creating new callsite clones
3043 // of the other callsites in that function that are reached by those
3044 // callers. Clone the other callees of the current callsite's caller
3045 // that were already assigned to PreviousAssignedFuncClone
3046 // accordingly. This is important since we subsequently update the
3047 // calls from the nodes in the graph and their assignments to callee
3048 // functions recorded in CallsiteToCalleeFuncCloneMap.
3049 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
3050 // Skip any that have been removed on an earlier iteration when
3051 // cleaning up newly None type callee edges.
3052 if (!CalleeEdge)
3053 continue;
3054 ContextNode *Callee = CalleeEdge->Callee;
3055 // Skip the current callsite; we are looking for other
3056 // callsites Caller calls, as well as any callee that does not have a
3057 // recorded callsite Call.
3058 if (Callee == Clone || !Callee->hasCall())
3059 continue;
3060 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3061 removeNoneTypeCalleeEdges(NewClone);
3062 // Moving the edge may have resulted in some none type
3063 // callee edges on the original Callee.
3064 removeNoneTypeCalleeEdges(Callee);
3065 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3066 // If the Callee node was already assigned to call a specific
3067 // function version, make sure its new clone is assigned to call
3068 // that same function clone.
3069 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3070 RecordCalleeFuncOfCallsite(
3071 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3072 // Update NewClone with the new Call clone of this callsite's Call
3073 // created for the new function clone created earlier.
3074 // Recall that we have already ensured when building the graph
3075 // that each caller can only call callsites within the same
3076 // function, so we are guaranteed that Callee Call is in the
3077 // current OrigFunc.
3078 // CallMap is set up as indexed by original Call at clone 0.
3079 CallInfo OrigCall(Callee->getOrigNode()->Call);
3080 OrigCall.setCloneNo(0);
3081 std::map<CallInfo, CallInfo> &CallMap =
3082 FuncClonesToCallMap[NewFuncClone];
3083 assert(CallMap.count(OrigCall));
3084 CallInfo NewCall(CallMap[OrigCall]);
3085 assert(NewCall);
3086 NewClone->setCall(NewCall);
3087 }
3088 }
3089 // Fall through to handling below to perform the recording of the
3090 // function for this callsite clone. This enables handling of cases
3091 // where the callers were assigned to different clones of a function.
3092 }
3093
3094 // See if we can use existing function clone. Walk through
3095 // all caller edges to see if any have already been assigned to
3096 // a clone of this callsite's function. If we can use it, do so. If not,
3097 // because that function clone is already assigned to a different clone
3098 // of this callsite, then we need to clone again.
3099 // Basically, this checking is needed to handle the case where different
3100 // caller functions/callsites may need versions of this function
3101 // containing different mixes of callsite clones across the different
3102 // callsites within the function. If that happens, we need to create
3103 // additional function clones to handle the various combinations.
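 // For example, if Func contains callsites A and B, one caller may need the
 // combination (A clone 1, B original) while another needs (A clone 1,
 // B clone 1); a single clone of Func cannot satisfy both, so an additional
 // function clone (and possibly another callsite clone) is created below.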
3104 //
3105 // Keep track of any new clones of this callsite created by the
3106 // following loop, as well as any existing clone that we decided to
3107 // assign this clone to.
3108 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3109 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3110 // We need to be able to remove Edge from CallerEdges, so need to adjust
3111 // iterator in the loop.
3112 for (auto EI = Clone->CallerEdges.begin();
3113 EI != Clone->CallerEdges.end();) {
3114 auto Edge = *EI;
3115 // Ignore any caller that does not have a recorded callsite Call.
3116 if (!Edge->Caller->hasCall()) {
3117 EI++;
3118 continue;
3119 }
3120 // If this caller is already assigned to call a version of OrigFunc, we
3121 // need to ensure we can assign this callsite clone to that function clone.
3122 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3123 FuncInfo FuncCloneCalledByCaller =
3124 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3125 // First we need to confirm that this function clone is available
3126 // for use by this callsite node clone.
3127 //
3128 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3129 // its callsite clones, one of those callsite clones X could have
3130 // been assigned to the same function clone called by Edge's caller
3131 // - if Edge's caller calls another callsite within Node's original
3132 // function, and that callsite has another caller reaching clone X.
3133 // We need to clone Node again in this case.
3134 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3135 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3136 Clone) ||
3137 // Detect when we have multiple callers of this callsite that
3138 // have already been assigned to specific, and different, clones
3139 // of OrigFunc (due to other unrelated callsites in Func they
3140 // reach via call contexts). Is this Clone of callsite Node
3141 // assigned to a different clone of OrigFunc? If so, clone Node
3142 // again.
3143 (FuncCloneAssignedToCurCallsiteClone &&
3144 FuncCloneAssignedToCurCallsiteClone !=
3145 FuncCloneCalledByCaller)) {
3146 // We need to use a different newly created callsite clone, in
3147 // order to assign it to another new function clone on a
3148 // subsequent iteration over the Clones array (adjusted below).
3149 // Note we specifically do not reset the
3150 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3151 // when this new clone is processed later we know which version of
3152 // the function to copy (so that other callsite clones we have
3153 // assigned to that function clone are properly cloned over). See
3154 // comments in the function cloning handling earlier.
3155
3156 // Check if we already have cloned this callsite again while
3157 // walking through caller edges, for a caller calling the same
3158 // function clone. If so, we can move this edge to that new clone
3159 // rather than creating yet another new clone.
3160 if (FuncCloneToNewCallsiteCloneMap.count(
3161 FuncCloneCalledByCaller)) {
3162 ContextNode *NewClone =
3163 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3164 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3165 // Cleanup any none type edges cloned over.
3166 removeNoneTypeCalleeEdges(NewClone);
3167 } else {
3168 // Create a new callsite clone.
3169 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3170 removeNoneTypeCalleeEdges(NewClone);
3171 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3172 NewClone;
3173 // Add to list of clones and process later.
3174 ClonesWorklist.push_back(NewClone);
3175 assert(EI == Clone->CallerEdges.end() ||
3176 Clone->AllocTypes != (uint8_t)AllocationType::None);
3177 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3178 }
3179 // Moving the caller edge may have resulted in some none type
3180 // callee edges.
3181 removeNoneTypeCalleeEdges(Clone);
3182 // We will handle the newly created callsite clone in a subsequent
3183 // iteration over this Node's Clones. Continue here since we
3184 // already adjusted iterator EI while moving the edge.
3185 continue;
3186 }
3187
3188 // Otherwise, we can use the function clone already assigned to this
3189 // caller.
3190 if (!FuncCloneAssignedToCurCallsiteClone) {
3191 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3192 // Assign Clone to FuncCloneCalledByCaller
3193 AssignCallsiteCloneToFuncClone(
3194 FuncCloneCalledByCaller, Call, Clone,
3195 AllocationCallToContextNodeMap.count(Call));
3196 } else
3197 // Don't need to do anything - callsite is already calling this
3198 // function clone.
3199 assert(FuncCloneAssignedToCurCallsiteClone ==
3200 FuncCloneCalledByCaller);
3201
3202 } else {
3203 // We have not already assigned this caller to a version of
3204 // OrigFunc. Do the assignment now.
3205
3206 // First check if we have already assigned this callsite clone to a
3207 // clone of OrigFunc for another caller during this iteration over
3208 // its caller edges.
3209 if (!FuncCloneAssignedToCurCallsiteClone) {
3210 // Find first function in FuncClonesToCallMap without an assigned
3211 // clone of this callsite Node. We should always have one
3212 // available at this point due to the earlier cloning when the
3213 // FuncClonesToCallMap size was smaller than the clone number.
3214 for (auto &CF : FuncClonesToCallMap) {
3215 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3216 FuncCloneAssignedToCurCallsiteClone = CF.first;
3217 break;
3218 }
3219 }
3220 assert(FuncCloneAssignedToCurCallsiteClone);
3221 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3222 AssignCallsiteCloneToFuncClone(
3223 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3224 AllocationCallToContextNodeMap.count(Call));
3225 } else
3226 assert(FuncCloneToCurNodeCloneMap
3227 [FuncCloneAssignedToCurCallsiteClone] == Clone);
3228 // Update callers to record function version called.
3229 RecordCalleeFuncOfCallsite(Edge->Caller,
3230 FuncCloneAssignedToCurCallsiteClone);
3231 }
3232
3233 EI++;
3234 }
3235 }
3236 if (VerifyCCG) {
3237 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3238 for (const auto &PE : Node->CalleeEdges)
3239 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3240 for (const auto &CE : Node->CallerEdges)
3241 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3242 for (auto *Clone : Node->Clones) {
3243 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3244 for (const auto &PE : Clone->CalleeEdges)
3245 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3246 for (const auto &CE : Clone->CallerEdges)
3247 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3248 }
3249 }
3250 }
3251 }
3252
3253 auto UpdateCalls = [&](ContextNode *Node,
3254 DenseSet<const ContextNode *> &Visited,
3255 auto &&UpdateCalls) {
3256 auto Inserted = Visited.insert(Node);
3257 if (!Inserted.second)
3258 return;
3259
3260 for (auto *Clone : Node->Clones)
3261 UpdateCalls(Clone, Visited, UpdateCalls);
3262
3263 for (auto &Edge : Node->CallerEdges)
3264 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3265
3266 // Skip if either no call to update, or if we ended up with no context ids
3267 // (we moved all edges onto other clones).
3268 if (!Node->hasCall() || Node->ContextIds.empty())
3269 return;
3270
3271 if (Node->IsAllocation) {
3272 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3273 return;
3274 }
3275
3276 if (!CallsiteToCalleeFuncCloneMap.count(Node))
3277 return;
3278
3279 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3280 updateCall(Node->Call, CalleeFunc);
3281 };
3282
3283 // Performs DFS traversal starting from allocation nodes to update calls to
3284 // reflect cloning decisions recorded earlier. For regular LTO this will
3285 // update the actual calls in the IR to call the appropriate function clone
3286 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3287 // are recorded in the summary entries.
3288 DenseSet<const ContextNode *> Visited;
3289 for (auto &Entry : AllocationCallToContextNodeMap)
3290 UpdateCalls(Entry.second, Visited, UpdateCalls);
3291
3292 return Changed;
3293}
3294
3295static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3296 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3297 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3298 &FuncToAliasMap) {
3299 // The first "clone" is the original copy; we should only call this if we
3300 // needed to create new clones.
3301 assert(NumClones > 1);
3302 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3303 VMaps.reserve(NumClones - 1);
3304 FunctionsClonedThinBackend++;
3305 for (unsigned I = 1; I < NumClones; I++) {
3306 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3307 auto *NewF = CloneFunction(&F, *VMaps.back());
3308 FunctionClonesThinBackend++;
3309 // Strip memprof and callsite metadata from clone as they are no longer
3310 // needed.
3311 for (auto &BB : *NewF) {
3312 for (auto &Inst : BB) {
3313 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3314 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3315 }
3316 }
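 // Clone names are formed from the original name plus the memprof clone
 // suffix and clone number (e.g. "foo" becomes "foo.memprof.1" for clone 1);
 // see getMemProfFuncName and MemProfCloneSuffix.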
3317 std::string Name = getMemProfFuncName(F.getName(), I);
3318 auto *PrevF = M.getFunction(Name);
3319 if (PrevF) {
3320 // We might have created this when adjusting a callsite in another
3321 // function. It should be a declaration.
3322 assert(PrevF->isDeclaration());
3323 NewF->takeName(PrevF);
3324 PrevF->replaceAllUsesWith(NewF);
3325 PrevF->eraseFromParent();
3326 } else
3327 NewF->setName(Name);
3328 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3329 << "created clone " << ore::NV("NewFunction", NewF));
3330
3331 // Now handle aliases to this function, and clone those as well.
3332 if (!FuncToAliasMap.count(&F))
3333 continue;
3334 for (auto *A : FuncToAliasMap[&F]) {
3335 std::string Name = getMemProfFuncName(A->getName(), I);
3336 auto *PrevA = M.getNamedAlias(Name);
3337 auto *NewA = GlobalAlias::create(A->getValueType(),
3338 A->getType()->getPointerAddressSpace(),
3339 A->getLinkage(), Name, NewF);
3340 NewA->copyAttributesFrom(A);
3341 if (PrevA) {
3342 // We might have created this when adjusting a callsite in another
3343 // function. It should be a declaration.
3344 assert(PrevA->isDeclaration());
3345 NewA->takeName(PrevA);
3346 PrevA->replaceAllUsesWith(NewA);
3347 PrevA->eraseFromParent();
3348 }
3349 }
3350 }
3351 return VMaps;
3352}
3353
3354// Locate the summary for F. This is complicated by the fact that it might
3355// have been internalized or promoted.
3356static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3357 const ModuleSummaryIndex *ImportSummary) {
3358 // FIXME: Ideally we would retain the original GUID in some fashion on the
3359 // function (e.g. as metadata), but for now do our best to locate the
3360 // summary without that information.
3361 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3362 if (!TheFnVI)
3363 // See if the function was internalized, by checking the index directly
3364 // with the original name (this avoids the name adjustment done by
3365 // getGUID() for internal symbols).
3366 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3367 if (TheFnVI)
3368 return TheFnVI;
3369 // Now query with the original name before any promotion was performed.
3370 StringRef OrigName =
3371 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3372 std::string OrigId = GlobalValue::getGlobalIdentifier(
3373 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3374 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3375 if (TheFnVI)
3376 return TheFnVI;
3377 // Could be a promoted local imported from another module. We need to pass
3378 // down more info here to find the original module id. For now, try with
3379 // the OrigName which might have been stored in the OidGuidMap in the
3380 // index. This would not work if there were same-named locals in multiple
3381 // modules, however.
3382 auto OrigGUID =
3383 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3384 if (OrigGUID)
3385 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3386 return TheFnVI;
3387}
3388
3389bool MemProfContextDisambiguation::applyImport(Module &M) {
3390 assert(ImportSummary);
3391 bool Changed = false;
3392
3393 auto IsMemProfClone = [](const Function &F) {
3394 return F.getName().contains(MemProfCloneSuffix);
3395 };
3396
3397 // We also need to clone any aliases that reference cloned functions, because
3398 // the modified callsites may invoke via the alias. Keep track of the aliases
3399 // for each function.
3400 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3401 FuncToAliasMap;
3402 for (auto &A : M.aliases()) {
3403 auto *Aliasee = A.getAliaseeObject();
3404 if (auto *F = dyn_cast<Function>(Aliasee))
3405 FuncToAliasMap[F].insert(&A);
3406 }
3407
3408 for (auto &F : M) {
3409 if (F.isDeclaration() || IsMemProfClone(F))
3410 continue;
3411
3412 OptimizationRemarkEmitter ORE(&F);
3413
3414 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3415 bool ClonesCreated = false;
3416 unsigned NumClonesCreated = 0;
3417 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3418 // We should at least have version 0 which is the original copy.
3419 assert(NumClones > 0);
3420 // If only one copy is needed, use the original.
3421 if (NumClones == 1)
3422 return;
3423 // If we already performed cloning of this function, confirm that the
3424 // requested number of clones matches (the thin link should ensure the
3425 // number of clones for each constituent callsite is consistent within
3426 // each function), before returning.
3427 if (ClonesCreated) {
3428 assert(NumClonesCreated == NumClones);
3429 return;
3430 }
3431 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3432 // The first "clone" is the original copy, which doesn't have a VMap.
3433 assert(VMaps.size() == NumClones - 1);
3434 Changed = true;
3435 ClonesCreated = true;
3436 NumClonesCreated = NumClones;
3437 };
3438
3439 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3440 Function *CalledFunction) {
3441 // Perform cloning if not yet done.
3442 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3443
3444 // Should have skipped indirect calls via mayHaveMemprofSummary.
3445 assert(CalledFunction);
3446 assert(!IsMemProfClone(*CalledFunction));
3447
3448 // Update the calls per the summary info.
3449 // Save orig name since it gets updated in the first iteration
3450 // below.
3451 auto CalleeOrigName = CalledFunction->getName();
3452 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3453 // Do nothing if this version calls the original version of its
3454 // callee.
3455 if (!StackNode.Clones[J])
3456 continue;
3457 auto NewF = M.getOrInsertFunction(
3458 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3459 CalledFunction->getFunctionType());
3460 CallBase *CBClone;
3461 // Copy 0 is the original function.
3462 if (!J)
3463 CBClone = CB;
3464 else
3465 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3466 CBClone->setCalledFunction(NewF);
3467 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3468 << ore::NV("Call", CBClone) << " in clone "
3469 << ore::NV("Caller", CBClone->getFunction())
3470 << " assigned to call function clone "
3471 << ore::NV("Callee", NewF.getCallee()));
3472 }
3473 };
3474
3475 // Locate the summary for F.
3476 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3477 // If not found, this could be an imported local (see comment in
3478 // findValueInfoForFunc). Skip for now as it will be cloned in its original
3479 // module (where it would have been promoted to global scope so should
3480 // satisfy any reference in this module).
3481 if (!TheFnVI)
3482 continue;
3483
3484 auto *GVSummary =
3485 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3486 if (!GVSummary) {
3487 // Must have been imported, use the summary which matches the definition
3488 // (might be multiple if this was a linkonce_odr).
3489 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3490 assert(SrcModuleMD &&
3491 "enable-import-metadata is needed to emit thinlto_src_module");
3492 StringRef SrcModule =
3493 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3494 for (auto &GVS : TheFnVI.getSummaryList()) {
3495 if (GVS->modulePath() == SrcModule) {
3496 GVSummary = GVS.get();
3497 break;
3498 }
3499 }
3500 assert(GVSummary && GVSummary->modulePath() == SrcModule);
3501 }
3502
3503 // If this was an imported alias skip it as we won't have the function
3504 // summary, and it should be cloned in the original module.
3505 if (isa<AliasSummary>(GVSummary))
3506 continue;
3507
3508 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3509
3510 if (FS->allocs().empty() && FS->callsites().empty())
3511 continue;
3512
3513 auto SI = FS->callsites().begin();
3514 auto AI = FS->allocs().begin();
3515
3516 // To handle callsite infos synthesized for tail calls which have missing
3517 // frames in the profiled context, map callee VI to the synthesized callsite
3518 // info.
3519 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3520 // Iterate the callsites for this function in reverse, since we place all
3521 // those synthesized for tail calls at the end.
3522 for (auto CallsiteIt = FS->callsites().rbegin();
3523 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3524 auto &Callsite = *CallsiteIt;
3525 // Stop as soon as we see a non-synthesized callsite info (see comment
3526 // above loop). All the entries added for discovered tail calls have empty
3527 // stack ids.
3528 if (!Callsite.StackIdIndices.empty())
3529 break;
3530 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3531 }
3532
3533 // Assume for now that the instructions are in the exact same order
3534 // as when the summary was created, but confirm this is correct by
3535 // matching the stack ids.
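 // The AI (allocs) and SI (callsites) iterators therefore advance in lockstep
 // with the instruction walk below, and the asserts cross-check the recorded
 // stack id indices against the instruction's callsite metadata.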
3536 for (auto &BB : F) {
3537 for (auto &I : BB) {
3538 auto *CB = dyn_cast<CallBase>(&I);
3539 // Same handling as when creating module summary.
3540 if (!mayHaveMemprofSummary(CB))
3541 continue;
3542
3543 auto *CalledValue = CB->getCalledOperand();
3544 auto *CalledFunction = CB->getCalledFunction();
3545 if (CalledValue && !CalledFunction) {
3546 CalledValue = CalledValue->stripPointerCasts();
3547 // Stripping pointer casts can reveal a called function.
3548 CalledFunction = dyn_cast<Function>(CalledValue);
3549 }
3550 // Check if this is an alias to a function. If so, get the
3551 // called aliasee for the checks below.
3552 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3553 assert(!CalledFunction &&
3554 "Expected null called function in callsite for alias");
3555 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3556 }
3557
3558 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3559 I.getMetadata(LLVMContext::MD_callsite));
3560 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3561
3562 // Include allocs that were already assigned a memprof function
3563 // attribute in the statistics.
3564 if (CB->getAttributes().hasFnAttr("memprof")) {
3565 assert(!MemProfMD);
3566 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3567 ? AllocTypeColdThinBackend++
3568 : AllocTypeNotColdThinBackend++;
3569 OrigAllocsThinBackend++;
3570 AllocVersionsThinBackend++;
3571 if (!MaxAllocVersionsThinBackend)
3572 MaxAllocVersionsThinBackend = 1;
3573 // Remove any remaining callsite metadata; we can skip the rest of the
3574 // handling for this instruction, since no cloning is needed.
3575 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3576 continue;
3577 }
3578
3579 if (MemProfMD) {
3580 // Consult the next alloc node.
3581 assert(AI != FS->allocs().end());
3582 auto &AllocNode = *(AI++);
3583
3584 // Sanity check that the MIB stack ids match between the summary and
3585 // instruction metadata.
3586 auto MIBIter = AllocNode.MIBs.begin();
3587 for (auto &MDOp : MemProfMD->operands()) {
3588 assert(MIBIter != AllocNode.MIBs.end());
3589 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3590 MIBIter->StackIdIndices.begin();
3591 auto *MIBMD = cast<const MDNode>(MDOp);
3592 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3593 assert(StackMDNode);
3594 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3595 auto ContextIterBegin =
3596 StackContext.beginAfterSharedPrefix(CallsiteContext);
3597 // Skip the checking on the first iteration.
3598 uint64_t LastStackContextId =
3599 (ContextIterBegin != StackContext.end() &&
3600 *ContextIterBegin == 0)
3601 ? 1
3602 : 0;
3603 for (auto ContextIter = ContextIterBegin;
3604 ContextIter != StackContext.end(); ++ContextIter) {
3605 // If this is a direct recursion, simply skip the duplicate
3606 // entries, to be consistent with how the summary ids were
3607 // generated during ModuleSummaryAnalysis.
3608 if (LastStackContextId == *ContextIter)
3609 continue;
3610 LastStackContextId = *ContextIter;
3611 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3612 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3613 *ContextIter);
3614 StackIdIndexIter++;
3615 }
3616 MIBIter++;
3617 }
3618
3619 // Perform cloning if not yet done.
3620 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3621
3622 OrigAllocsThinBackend++;
3623 AllocVersionsThinBackend += AllocNode.Versions.size();
3624 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3625 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3626
3627 // If there is only one version that means we didn't end up
3628 // considering this function for cloning, and in that case the alloc
3629 // will still be none type or should have gotten the default NotCold.
3630 // Skip that after calling clone helper since that does some sanity
3631 // checks that confirm we haven't decided yet that we need cloning.
3632 if (AllocNode.Versions.size() == 1) {
3633 assert((AllocationType)AllocNode.Versions[0] ==
3634 AllocationType::NotCold ||
3635 (AllocationType)AllocNode.Versions[0] ==
3636 AllocationType::None);
3637 UnclonableAllocsThinBackend++;
3638 continue;
3639 }
3640
3641 // All versions should have a singular allocation type.
3642 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3643 return Type == ((uint8_t)AllocationType::NotCold |
3644 (uint8_t)AllocationType::Cold);
3645 }));
3646
3647 // Update the allocation types per the summary info.
3648 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3649 // Ignore any that didn't get an assigned allocation type.
3650 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3651 continue;
3652 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3653 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3654 : AllocTypeNotColdThinBackend++;
3655 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3656 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3657 AllocTypeString);
3658 CallBase *CBClone;
3659 // Copy 0 is the original function.
3660 if (!J)
3661 CBClone = CB;
3662 else
3663 // Since VMaps are only created for new clones, we index with
3664 // clone J-1 (J==0 is the original clone and does not have a VMaps
3665 // entry).
3666 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3667 CBClone->addFnAttr(A);
3668 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3669 << ore::NV("AllocationCall", CBClone) << " in clone "
3670 << ore::NV("Caller", CBClone->getFunction())
3671 << " marked with memprof allocation attribute "
3672 << ore::NV("Attribute", AllocTypeString));
3673 }
3674 } else if (!CallsiteContext.empty()) {
3675 // Consult the next callsite node.
3676 assert(SI != FS->callsites().end());
3677 auto &StackNode = *(SI++);
3678
3679#ifndef NDEBUG
3680 // Sanity check that the stack ids match between the summary and
3681 // instruction metadata.
3682 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3683 for (auto StackId : CallsiteContext) {
3684 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3685 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3686 StackId);
3687 StackIdIndexIter++;
3688 }
3689#endif
3690
3691 CloneCallsite(StackNode, CB, CalledFunction);
3692 } else if (CB->isTailCall()) {
3693 // Locate the synthesized callsite info for the callee VI, if any was
3694 // created, and use that for cloning.
3695 ValueInfo CalleeVI =
3696 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3697 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3698 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3699 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3700 CloneCallsite(Callsite->second, CB, CalledFunction);
3701 }
3702 }
3703 // Memprof and callsite metadata on memory allocations no longer needed.
3704 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3705 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3706 }
3707 }
3708 }
3709
3710 return Changed;
3711}
3712
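// Top-level driver over the context graph: optionally dump, verify and export
// the graph, identify the callsite/allocation clones needed to disambiguate
// contexts, then assign those clones to function clones and update the calls
// (or the summary, for the index-based graph) accordingly.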
3713template <typename DerivedCCG, typename FuncTy, typename CallTy>
3714bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3715 if (DumpCCG) {
3716 dbgs() << "CCG before cloning:\n";
3717 dbgs() << *this;
3718 }
3719 if (ExportToDot)
3720 exportToDot("postbuild");
3721
3722 if (VerifyCCG) {
3723 check();
3724 }
3725
3726 identifyClones();
3727
3728 if (VerifyCCG) {
3729 check();
3730 }
3731
3732 if (DumpCCG) {
3733 dbgs() << "CCG after cloning:\n";
3734 dbgs() << *this;
3735 }
3736 if (ExportToDot)
3737 exportToDot("cloned");
3738
3739 bool Changed = assignFunctions();
3740
3741 if (DumpCCG) {
3742 dbgs() << "CCG after assigning function clones:\n";
3743 dbgs() << *this;
3744 }
3745 if (ExportToDot)
3746 exportToDot("clonefuncassign");
3747
3748 return Changed;
3749}
3750
3751bool MemProfContextDisambiguation::processModule(
3752 Module &M,
3753 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3754
3755 // If we have an import summary, then the cloning decisions were made during
3756 // the thin link on the index. Apply them and return.
3757 if (ImportSummary)
3758 return applyImport(M);
3759
3760 // TODO: If/when other types of memprof cloning are enabled beyond just for
3761 // hot and cold, we will need to change this to individually control the
3762 // AllocationType passed to addStackNodesForMIB during CCG construction.
3763 // Note that we specifically check this after applying imports above, so that
3764 // the option isn't needed to be passed to distributed ThinLTO backend
3765 // clang processes, which won't necessarily have visibility into the linker
3766 // dependences. Instead the information is communicated from the LTO link to
3767 // the backends via the combined summary index.
3768 if (!SupportsHotColdNew)
3769 return false;
3770
3771 ModuleCallsiteContextGraph CCG(M, OREGetter);
3772 return CCG.process();
3773}
3774
3775MemProfContextDisambiguation::MemProfContextDisambiguation(
3776 const ModuleSummaryIndex *Summary)
3777 : ImportSummary(Summary) {
3778 if (ImportSummary) {
3779 // The MemProfImportSummary should only be used for testing ThinLTO
3780 // distributed backend handling via opt, in which case we don't have a
3781 // summary from the pass pipeline.
3782 assert(MemProfImportSummary.empty());
3783 return;
3784 }
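 // Testing sketch (assumed option and pass names): the distributed ThinLTO
 // backend handling can be exercised via opt by supplying a summary bitcode
 // file, e.g.:
 //   opt -passes=memprof-context-disambiguation \
 //       -memprof-import-summary=distributed.thinlto.bc input.ll -S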
3785 if (MemProfImportSummary.empty())
3786 return;
3787
3788 auto ReadSummaryFile =
3789 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3790 if (!ReadSummaryFile) {
3791 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3792 "Error loading file '" + MemProfImportSummary +
3793 "': ");
3794 return;
3795 }
3796 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3797 if (!ImportSummaryForTestingOrErr) {
3798 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3799 "Error parsing file '" + MemProfImportSummary +
3800 "': ");
3801 return;
3802 }
3803 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3804 ImportSummary = ImportSummaryForTesting.get();
3805}
3806
3807PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3808 ModuleAnalysisManager &AM) {
3809 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3810 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3811 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3812 };
3813 if (!processModule(M, OREGetter))
3814 return PreservedAnalyses::all();
3815 return PreservedAnalyses::none();
3816}
3817
3818void MemProfContextDisambiguation::run(
3819 ModuleSummaryIndex &Index,
3820 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3821 isPrevailing) {
3822 // TODO: If/when other types of memprof cloning are enabled beyond just for
3823 // hot and cold, we will need to change this to individually control the
3824 // AllocationType passed to addStackNodesForMIB during CCG construction.
3825 // The index was set from the option, so these should be in sync.
3826 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3827 if (!SupportsHotColdNew)
3828 return;
3829
3830 IndexCallsiteContextGraph CCG(Index, isPrevailing);
3831 CCG.process();
3832}
3833
