//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));
} // namespace llvm
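
// Note: a convenient way to exercise this pass in isolation (mirroring how
// the LLVM regression tests typically drive it; exact flags may vary by
// version) is something like:
//   opt -passes=memprof-context-disambiguation -supports-hot-cold-new in.ll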

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
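///
/// For example (a purely hypothetical illustration, not from any real
/// profile): if an allocation in C is reached via contexts A->B->C (cold) and
/// D->B->C (not cold), the graph gets an allocation node for C plus stack
/// nodes for B, A and D, with each edge decorated with the ids of the contexts
/// flowing through it. Cloning B and C along the A->B->C path then allows the
/// allocation to be annotated cold on that clone only, leaving the D->B->C
/// context unaffected.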
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case where we have an Instruction* it
    // is not as immediately discoverable. Used for printing richer information
    // when dumping the graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // The corresponding allocation or interior call.
    CallInfo Call;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
    uint8_t AllocTypes = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // The set of IDs for contexts including this node.
    DenseSet<uint32_t> ContextIds;

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // its context id set, caller edges, and callee edges should all be empty.
    bool isRemoved() const {
      // Note that we can have non-empty context ids with empty caller and
      // callee edges if the graph ends up with a single node.
      if (ContextIds.empty())
        assert(CalleeEdges.empty() && CallerEdges.empty() &&
               "Context ids empty but at least one of callee and caller edges "
               "were not!");
      return ContextIds.empty();
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(ContextIds) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helpers to remove callee edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);
  void
  recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
                                       DenseSet<const ContextNode *> &Visited);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

  using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
                                     const FuncTy *, DenseSet<uint32_t>>;

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
  /// next element after the input position (either incremented or updated after
  /// removing the old edge).
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording the mapping of
  /// all of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Removes the node information recorded for the given call.
  void unsetNodeForInst(const CallInfo &C);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids);

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list. If ContextIdsToMove is non-empty, only that
  /// subset of Edge's ids are moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           EdgeIter *CallerEdgeI = nullptr,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list. If ContextIdsToMove is non-empty, only that
  /// subset of Edge's ids are moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     EdgeIter *CallerEdgeI = nullptr,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  std::map<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

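  // operator-> is an identity operation so that CallInfo::call()->print(OS)
  // compiles when CallTy is IndexCall, which is a value type rather than a
  // pointer.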
  IndexCall *operator->() { return this; }

  PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }

  void print(raw_ostream &OS) const {
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
      assert(CI);
      OS << *CI;
    }
  }
};

/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves mapping from function summaries containing memprof records back to
  // their VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace

namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

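// Helper to print a separator between streamed fields, suppressing the
// separator before the first field printed (see the operator<< overload
// below).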
struct FieldSeparator {
  bool Skip = true;
  const char *Sep;

  FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
};

raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
  if (FS.Skip) {
    FS.Skip = false;
    return OS;
  }
  return OS << FS.Sep;
}

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
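// For example, a NotCold|Cold mix resolves to NotCold here, while a pure Cold
// or NotCold value is returned unchanged.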
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}

} // end anonymous namespace

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
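  // Check the allocation call map first; a given call is recorded in at most
  // one of the two maps.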
  ContextNode *Node = getNodeForAlloc(C);
  if (Node)
    return Node;

  return NonAllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
  if (StackEntryNode != StackEntryIdToContextNodeMap.end())
    return StackEntryNode->second;
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
    const CallInfo &C) {
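  // The call is recorded in at most one of the maps, so the || short-circuits
  // and the second erase is only attempted when the first found nothing.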
  AllocationCallToContextNodeMap.erase(C) ||
      NonAllocationCallToContextNodeMap.erase(C);
  assert(!AllocationCallToContextNodeMap.count(C) &&
         !NonAllocationCallToContextNodeMap.count(C));
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
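  // If an edge from this caller already exists, simply fold the new context id
  // and allocation type into it.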
  for (auto &Edge : CallerEdges) {
    if (Edge->Caller == Caller) {
      Edge->AllocTypes |= (uint8_t)AllocType;
      Edge->getContextIds().insert(ContextId);
      return;
    }
  }
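  // Otherwise create a new edge and register it on both endpoints.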
  std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(Edge);
  Caller->CalleeEdges.push_back(Edge);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
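  // Advance the iterator manually so edges can be erased while walking the
  // list.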
  for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      Edge->Callee->eraseCallerEdge(Edge.get());
      EI = Node->CalleeEdges.erase(EI);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  for (const auto &Edge : CalleeEdges)
    if (Edge->Callee == Callee)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  for (const auto &Edge : CallerEdges)
    if (Edge->Caller == Caller)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
        return CalleeEdge.get() == Edge;
      });
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
        return CallerEdge.get() == Edge;
      });
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  if (Node1Ids.size() < Node2Ids.size())
    return intersectAllocTypesImpl(Node1Ids, Node2Ids);
  else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
    CallInfo Call, const FuncTy *F) {
  assert(!getNodeForAlloc(Call));
  NodeOwner.push_back(
      std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
  ContextNode *AllocNode = NodeOwner.back().get();
  AllocationCallToContextNodeMap[Call] = AllocNode;
  NodeToCallingFunc[AllocNode] = F;
  // Use LastContextId as a unique id for MIB allocation nodes.
  AllocNode->OrigStackOrAllocId = LastContextId;
  // Alloc type should be updated as we add in the MIBs. We should assert
  // afterwards that it is not still None.
  AllocNode->AllocTypes = (uint8_t)AllocationType::None;

  return AllocNode;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
  // Treat the hot alloc type as NotCold until the disambiguation for "hot" is
  // done.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  ContextIdToAllocationType[++LastContextId] = AllocType;

  // Update alloc type and context ids for this MIB.
  AllocNode->AllocTypes |= (uint8_t)AllocType;
  AllocNode->ContextIds.insert(LastContextId);

  // Now add or update nodes for each stack id in alloc's context.
  // Later when processing the stack ids on non-alloc callsites we will adjust
  // for any inlining in the context.
  ContextNode *PrevNode = AllocNode;
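  // We walk the stack from the allocation outwards, so each stack node becomes
  // the caller of the node added just before it (the allocation itself is the
  // innermost callee).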
  // Look for recursion (direct recursion should have been collapsed by
  // module summary analysis, here we should just be detecting mutual
  // recursion). Mark these nodes so we don't try to clone.
  SmallSet<uint64_t, 8> StackIdSet;
  // Skip any on the allocation call (inlining).
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      NodeOwner.push_back(
          std::make_unique<ContextNode>(/*IsAllocation=*/false));
      StackNode = NodeOwner.back().get();
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    auto Ins = StackIdSet.insert(StackId);
    if (!Ins.second)
      StackNode->Recursive = true;
    StackNode->ContextIds.insert(LastContextId);
    StackNode->AllocTypes |= (uint8_t)AllocType;
    PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    PrevNode = StackNode;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
DenseSet<uint32_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
    const DenseSet<uint32_t> &StackSequenceContextIds,
    DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  DenseSet<uint32_t> NewContextIds;
  for (auto OldId : StackSequenceContextIds) {
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
  }
  return NewContextIds;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Build a set of duplicated context ids corresponding to the input id set.
  auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
    DenseSet<uint32_t> NewIds;
    for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert(NewId->second.begin(), NewId->second.end());
    return NewIds;
  };

  // Recursively update context ids sets along caller edges.
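  // Note the lambda receives itself as its final parameter (UpdateCallers), a
  // common idiom for writing a recursive lambda without std::function.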
  auto UpdateCallers = [&](ContextNode *Node,
                           DenseSet<const ContextEdge *> &Visited,
                           auto &&UpdateCallers) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      auto Inserted = Visited.insert(Edge.get());
      if (!Inserted.second)
        continue;
      ContextNode *NextNode = Edge->Caller;
      DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
      // Only need to recursively iterate to NextNode via this caller edge if
      // it resulted in any added ids to NextNode.
      if (!NewIdsToAdd.empty()) {
        Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        UpdateCallers(NextNode, Visited, UpdateCallers);
      }
    }
  };

  DenseSet<const ContextEdge *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    // Update ids on the allocation nodes before calling the recursive
    // update along caller edges, since this simplifies the logic during
    // that traversal.
    DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
    Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
    UpdateCallers(Node, Visited, UpdateCallers);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
  // Make a copy of the context ids, since this will be adjusted below as they
  // are moved.
  DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
  auto &OrigEdges =
      TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
  // Increment iterator in loop so that we can remove edges as needed.
  for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
    auto Edge = *EI;
    // Remove any matching context ids from Edge and return the set that were
    // found and removed; these become the new edge's context ids. Also update
    // the remaining (not found) ids.
    DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
    set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                 NotFoundContextIds);
    RemainingContextIds.swap(NotFoundContextIds);
    // If no matching context ids for this edge, skip it.
    if (NewEdgeContextIds.empty()) {
      ++EI;
      continue;
    }
    if (TowardsCallee) {
      auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
          NewEdgeContextIds);
      NewNode->CalleeEdges.push_back(NewEdge);
      NewEdge->Callee->CallerEdges.push_back(NewEdge);
    } else {
      auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
          NewEdgeContextIds);
      NewNode->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
    // Remove old edge if context ids empty.
    if (Edge->getContextIds().empty()) {
      if (TowardsCallee) {
        Edge->Callee->eraseCallerEdge(Edge.get());
        EI = OrigNode->CalleeEdges.erase(EI);
      } else {
        Edge->Caller->eraseCalleeEdge(Edge.get());
        EI = OrigNode->CallerEdges.erase(EI);
      }
      continue;
    }
    ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    assignStackNodesPostOrder(ContextNode *Node,
                              DenseSet<const ContextNode *> &Visited,
                              DenseMap<uint64_t, std::vector<CallContextInfo>>
                                  &StackIdToMatchingCalls) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;
  // Post order traversal. Iterate over a copy since we may add nodes and
  // therefore new callers during the recursive call, invalidating any
  // iterator over the original edge vector. We don't need to process these
  // new nodes as they were already processed on creation.
  auto CallerEdges = Node->CallerEdges;
  for (auto &Edge : CallerEdges) {
    // Skip any that have been removed during the recursion.
    if (!Edge)
      continue;
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
  }

  // If this node's stack id is in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.

  // Ignore this node if it is for an allocation or we didn't record any
  // stack id lists ending at it.
  if (Node->IsAllocation ||
      !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
    return;

  auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
  // Handle the simple case first. A single call with a single stack id.
  // In this case there is no need to create any new context nodes, simply
  // assign the context node for stack id to this Call.
  if (Calls.size() == 1) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
    if (Ids.size() == 1) {
      assert(SavedContextIds.empty());
      // It should be this Node.
      assert(Node == getNodeForStackId(Ids[0]));
      if (Node->Recursive)
        return;
      Node->setCall(Call);
      NonAllocationCallToContextNodeMap[Call] = Node;
      NodeToCallingFunc[Node] = Func;
      return;
    }
  }

  // Find the node for the last stack id, which should be the same
  // across all calls recorded for this id, and is this node's id.
  uint64_t LastId = Node->OrigStackOrAllocId;
  ContextNode *LastNode = getNodeForStackId(LastId);
  // We should only have kept stack ids that had nodes.
  assert(LastNode);

  for (unsigned I = 0; I < Calls.size(); I++) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
    // Skip any for which we didn't assign any ids, these don't get a node in
    // the graph.
    if (SavedContextIds.empty())
      continue;

    assert(LastId == Ids.back());

    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
    assert(FirstNode);

    // Recompute the context ids for this stack id sequence (the
    // intersection of the context ids of the corresponding nodes).
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and removed
    // them already during the post order traversal.
    set_intersect(SavedContextIds, FirstNode->ContextIds);
    ContextNode *PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes and weren't
      // recursive.
      assert(CurNode);
      assert(!CurNode->Recursive);
      if (!PrevNode) {
        PrevNode = CurNode;
        continue;
      }
      auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
      if (!Edge) {
        SavedContextIds.clear();
        break;
      }
      PrevNode = CurNode;
      set_intersect(SavedContextIds, Edge->getContextIds());

      // If we now have no context ids for the clone, skip this call.
      if (SavedContextIds.empty())
        break;
    }
    if (SavedContextIds.empty())
      continue;
1219 | |
1220 | // Create new context node. |
1221 | NodeOwner.push_back( |
1222 | std::make_unique<ContextNode>(/*IsAllocation=*/false, Call)); |
1223 | ContextNode *NewNode = NodeOwner.back().get(); |
1224 | NodeToCallingFunc[NewNode] = Func; |
1225 | NonAllocationCallToContextNodeMap[Call] = NewNode; |
1226 | NewNode->ContextIds = SavedContextIds; |
1227 | NewNode->AllocTypes = computeAllocType(ContextIds&: NewNode->ContextIds); |
1228 | |
// Connect to callees of innermost stack frame in inlined call chain.
// This updates context ids for FirstNode's callees to reflect those
// moved to NewNode.
connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);

// Connect to callers of outermost stack frame in inlined call chain.
// This updates context ids for LastNode's callers to reflect those
// moved to NewNode.
connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1238 | |
1239 | // Now we need to remove context ids from edges/nodes between First and |
1240 | // Last Node. |
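// For example (hypothetical ids): if NewNode took over context ids {5, 6},
// each node along Ids and each connecting edge drops 5 and 6 below, and any
// edge whose context id set becomes empty is erased from both endpoints.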
1241 | PrevNode = nullptr; |
1242 | for (auto Id : Ids) { |
ContextNode *CurNode = getNodeForStackId(Id);
1244 | // We should only have kept stack ids that had nodes. |
1245 | assert(CurNode); |
1246 | |
1247 | // Remove the context ids moved to NewNode from CurNode, and the |
1248 | // edge from the prior node. |
1249 | set_subtract(CurNode->ContextIds, NewNode->ContextIds); |
1250 | if (PrevNode) { |
1251 | auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); |
1252 | assert(PrevEdge); |
1253 | set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds); |
1254 | if (PrevEdge->getContextIds().empty()) { |
1255 | PrevNode->eraseCallerEdge(PrevEdge); |
1256 | CurNode->eraseCalleeEdge(PrevEdge); |
1257 | } |
1258 | } |
1259 | PrevNode = CurNode; |
1260 | } |
1261 | } |
1262 | } |
1263 | |
1264 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1265 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { |
1266 | // Map of stack id to all calls with that as the last (outermost caller) |
1267 | // callsite id that has a context node (some might not due to pruning |
1268 | // performed during matching of the allocation profile contexts). |
1269 | // The CallContextInfo contains the Call and a list of its stack ids with |
1270 | // ContextNodes, the function containing Call, and the set of context ids |
1271 | // the analysis will eventually identify for use in any new node created |
1272 | // for that callsite. |
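// For example (hypothetical functions and ids): if bar was inlined into foo,
// a call in foo may carry callsite metadata with stack ids {B, F} (leaf to
// outermost). It is recorded under its last id F, together with any other
// calls whose outermost stack id with a context node is also F.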
1273 | DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls; |
1274 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
1275 | for (auto &Call : CallsWithMetadata) { |
1276 | // Ignore allocations, already handled. |
1277 | if (AllocationCallToContextNodeMap.count(Call)) |
1278 | continue; |
1279 | auto StackIdsWithContextNodes = |
getStackIdsWithContextNodesForCall(Call.call());
1281 | // If there were no nodes created for MIBs on allocs (maybe this was in |
1282 | // the unambiguous part of the MIB stack that was pruned), ignore. |
1283 | if (StackIdsWithContextNodes.empty()) |
1284 | continue; |
1285 | // Otherwise, record this Call along with the list of ids for the last |
1286 | // (outermost caller) stack id with a node. |
1287 | StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( |
1288 | {Call.call(), StackIdsWithContextNodes, Func, {}}); |
1289 | } |
1290 | } |
1291 | |
1292 | // First make a pass through all stack ids that correspond to a call, |
1293 | // as identified in the above loop. Compute the context ids corresponding to |
// each of these calls when they correspond to multiple stack ids due to
// inlining. Perform any duplication of context ids required when
1296 | // there is more than one call with the same stack ids. Their (possibly newly |
1297 | // duplicated) context ids are saved in the StackIdToMatchingCalls map. |
1298 | DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; |
1299 | for (auto &It : StackIdToMatchingCalls) { |
1300 | auto &Calls = It.getSecond(); |
1301 | // Skip single calls with a single stack id. These don't need a new node. |
1302 | if (Calls.size() == 1) { |
1303 | auto &Ids = std::get<1>(Calls[0]); |
1304 | if (Ids.size() == 1) |
1305 | continue; |
1306 | } |
1307 | // In order to do the best and maximal matching of inlined calls to context |
1308 | // node sequences we will sort the vectors of stack ids in descending order |
1309 | // of length, and within each length, lexicographically by stack id. The |
1310 | // latter is so that we can specially handle calls that have identical stack |
1311 | // id sequences (either due to cloning or artificially because of the MIB |
1312 | // context pruning). |
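// For example (hypothetical id sequences): {1, 2, 3} sorts before {1, 2},
// which sorts before {1, 3}; calls that share the identical sequence {1, 2}
// end up adjacent, which is what enables the duplicate handling below.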
1313 | std::stable_sort(Calls.begin(), Calls.end(), |
1314 | [](const CallContextInfo &A, const CallContextInfo &B) { |
1315 | auto &IdsA = std::get<1>(A); |
1316 | auto &IdsB = std::get<1>(B); |
1317 | return IdsA.size() > IdsB.size() || |
1318 | (IdsA.size() == IdsB.size() && IdsA < IdsB); |
1319 | }); |
1320 | |
1321 | // Find the node for the last stack id, which should be the same |
1322 | // across all calls recorded for this id, and is the id for this |
1323 | // entry in the StackIdToMatchingCalls map. |
1324 | uint64_t LastId = It.getFirst(); |
ContextNode *LastNode = getNodeForStackId(LastId);
1326 | // We should only have kept stack ids that had nodes. |
1327 | assert(LastNode); |
1328 | |
1329 | if (LastNode->Recursive) |
1330 | continue; |
1331 | |
1332 | // Initialize the context ids with the last node's. We will subsequently |
1333 | // refine the context ids by computing the intersection along all edges. |
1334 | DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds; |
1335 | assert(!LastNodeContextIds.empty()); |
1336 | |
1337 | for (unsigned I = 0; I < Calls.size(); I++) { |
1338 | auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
1339 | assert(SavedContextIds.empty()); |
1340 | assert(LastId == Ids.back()); |
1341 | |
1342 | // First compute the context ids for this stack id sequence (the |
1343 | // intersection of the context ids of the corresponding nodes). |
1344 | // Start with the remaining saved ids for the last node. |
1345 | assert(!LastNodeContextIds.empty()); |
1346 | DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; |
1347 | |
1348 | ContextNode *PrevNode = LastNode; |
1349 | ContextNode *CurNode = LastNode; |
1350 | bool Skip = false; |
1351 | |
1352 | // Iterate backwards through the stack Ids, starting after the last Id |
1353 | // in the list, which was handled once outside for all Calls. |
1354 | for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
1355 | auto Id = *IdIter; |
CurNode = getNodeForStackId(Id);
1357 | // We should only have kept stack ids that had nodes. |
1358 | assert(CurNode); |
1359 | |
1360 | if (CurNode->Recursive) { |
1361 | Skip = true; |
1362 | break; |
1363 | } |
1364 | |
1365 | auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
1366 | // If there is no edge then the nodes belong to different MIB contexts, |
1367 | // and we should skip this inlined context sequence. For example, this |
1368 | // particular inlined context may include stack ids A->B, and we may |
1369 | // indeed have nodes for both A and B, but it is possible that they were |
1370 | // never profiled in sequence in a single MIB for any allocation (i.e. |
1371 | // we might have profiled an allocation that involves the callsite A, |
1372 | // but through a different one of its callee callsites, and we might |
1373 | // have profiled an allocation that involves callsite B, but reached |
1374 | // from a different caller callsite). |
1375 | if (!Edge) { |
1376 | Skip = true; |
1377 | break; |
1378 | } |
1379 | PrevNode = CurNode; |
1380 | |
1381 | // Update the context ids, which is the intersection of the ids along |
1382 | // all edges in the sequence. |
1383 | set_intersect(StackSequenceContextIds, Edge->getContextIds()); |
1384 | |
1385 | // If we now have no context ids for clone, skip this call. |
1386 | if (StackSequenceContextIds.empty()) { |
1387 | Skip = true; |
1388 | break; |
1389 | } |
1390 | } |
1391 | if (Skip) |
1392 | continue; |
1393 | |
1394 | // If some of this call's stack ids did not have corresponding nodes (due |
1395 | // to pruning), don't include any context ids for contexts that extend |
1396 | // beyond these nodes. Otherwise we would be matching part of unrelated / |
1397 | // not fully matching stack contexts. To do this, subtract any context ids |
1398 | // found in caller nodes of the last node found above. |
1399 | if (Ids.back() != getLastStackId(Call)) { |
1400 | for (const auto &PE : LastNode->CallerEdges) { |
1401 | set_subtract(StackSequenceContextIds, PE->getContextIds()); |
1402 | if (StackSequenceContextIds.empty()) |
1403 | break; |
1404 | } |
1405 | // If we now have no context ids for clone, skip this call. |
1406 | if (StackSequenceContextIds.empty()) |
1407 | continue; |
1408 | } |
1409 | |
1410 | // Check if the next set of stack ids is the same (since the Calls vector |
1411 | // of tuples is sorted by the stack ids we can just look at the next one). |
1412 | bool DuplicateContextIds = false; |
1413 | if (I + 1 < Calls.size()) { |
1414 | auto NextIds = std::get<1>(Calls[I + 1]); |
1415 | DuplicateContextIds = Ids == NextIds; |
1416 | } |
1417 | |
1418 | // If we don't have duplicate context ids, then we can assign all the |
1419 | // context ids computed for the original node sequence to this call. |
1420 | // If there are duplicate calls with the same stack ids then we synthesize |
1421 | // new context ids that are duplicates of the originals. These are |
1422 | // assigned to SavedContextIds, which is a reference into the map entry |
1423 | // for this call, allowing us to access these ids later on. |
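// For example (hypothetical ids): if two calls both matched sequence {A, B}
// and the computed context ids were {7, 8}, the first call is assigned fresh
// duplicates (say {9, 10}), recorded in OldToNewContextIds, while the last
// call in the run keeps the original {7, 8}; the duplicates are propagated
// over the rest of the graph after this loop.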
OldToNewContextIds.reserve(OldToNewContextIds.size() +
1425 | StackSequenceContextIds.size()); |
1426 | SavedContextIds = |
1427 | DuplicateContextIds |
1428 | ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) |
1429 | : StackSequenceContextIds; |
1430 | assert(!SavedContextIds.empty()); |
1431 | |
1432 | if (!DuplicateContextIds) { |
1433 | // Update saved last node's context ids to remove those that are |
1434 | // assigned to other calls, so that it is ready for the next call at |
1435 | // this stack id. |
set_subtract(LastNodeContextIds, StackSequenceContextIds);
1437 | if (LastNodeContextIds.empty()) |
1438 | break; |
1439 | } |
1440 | } |
1441 | } |
1442 | |
1443 | // Propagate the duplicate context ids over the graph. |
1444 | propagateDuplicateContextIds(OldToNewContextIds); |
1445 | |
1446 | if (VerifyCCG) |
1447 | check(); |
1448 | |
1449 | // Now perform a post-order traversal over the graph, starting with the |
1450 | // allocation nodes, essentially processing nodes from callers to callees. |
1451 | // For any that contains an id in the map, update the graph to contain new |
1452 | // nodes representing any inlining at interior callsites. Note we move the |
1453 | // associated context ids over to the new nodes. |
1454 | DenseSet<const ContextNode *> Visited; |
1455 | for (auto &Entry : AllocationCallToContextNodeMap) |
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1457 | } |
1458 | |
1459 | uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { |
1460 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
Call->getMetadata(LLVMContext::MD_callsite));
1462 | return CallsiteContext.back(); |
1463 | } |
1464 | |
1465 | uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { |
1466 | assert(isa<CallsiteInfo *>(Call.getBase())); |
1467 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
// Need to convert index into stack id.
return Index.getStackIdAtIndex(CallsiteContext.back());
1471 | } |
1472 | |
static const std::string MemProfCloneSuffix = ".memprof.";
1474 | |
1475 | static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { |
1476 | // We use CloneNo == 0 to refer to the original version, which doesn't get |
1477 | // renamed with a suffix. |
1478 | if (!CloneNo) |
1479 | return Base.str(); |
1480 | return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); |
1481 | } |
1482 | |
1483 | std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, |
1484 | const Instruction *Call, |
1485 | unsigned CloneNo) const { |
1486 | return (Twine(Call->getFunction()->getName()) + " -> " + |
cast<CallBase>(Call)->getCalledFunction()->getName())
1488 | .str(); |
1489 | } |
1490 | |
1491 | std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, |
1492 | const IndexCall &Call, |
1493 | unsigned CloneNo) const { |
auto VI = FSToVIMap.find(Func);
assert(VI != FSToVIMap.end());
if (isa<AllocInfo *>(Call.getBase()))
return (VI->second.name() + " -> alloc").str();
else {
auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
return (VI->second.name() + " -> " +
getMemProfFuncName(Callsite->Callee.name(),
Callsite->Clones[CloneNo]))
.str();
1504 | } |
1505 | } |
1506 | |
1507 | std::vector<uint64_t> |
1508 | ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( |
1509 | Instruction *Call) { |
1510 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
Call->getMetadata(LLVMContext::MD_callsite));
1512 | return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( |
1513 | CallsiteContext); |
1514 | } |
1515 | |
1516 | std::vector<uint64_t> |
1517 | IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { |
1518 | assert(isa<CallsiteInfo *>(Call.getBase())); |
1519 | CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1521 | return getStackIdsWithContextNodes<CallsiteInfo, |
1522 | SmallVector<unsigned>::const_iterator>( |
1523 | CallsiteContext); |
1524 | } |
1525 | |
1526 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1527 | template <class NodeT, class IteratorT> |
1528 | std::vector<uint64_t> |
1529 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( |
1530 | CallStack<NodeT, IteratorT> &CallsiteContext) { |
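// Walk the callsite context from the leaf (innermost) frame outwards,
// collecting stack ids that have context nodes in the graph. Stop at the
// first id without a node: only the contiguous matched prefix is useful,
// since contexts beyond a pruned node cannot be matched further anyway.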
1531 | std::vector<uint64_t> StackIds; |
1532 | for (auto IdOrIndex : CallsiteContext) { |
1533 | auto StackId = getStackId(IdOrIndex); |
1534 | ContextNode *Node = getNodeForStackId(StackId); |
1535 | if (!Node) |
1536 | break; |
1537 | StackIds.push_back(StackId); |
1538 | } |
1539 | return StackIds; |
1540 | } |
1541 | |
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1543 | Module &M, |
1544 | llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) |
1545 | : Mod(M), OREGetter(OREGetter) { |
1546 | for (auto &F : M) { |
1547 | std::vector<CallInfo> CallsWithMetadata; |
1548 | for (auto &BB : F) { |
1549 | for (auto &I : BB) { |
if (!isa<CallBase>(I))
continue;
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
CallsWithMetadata.push_back(&I);
auto *AllocNode = addAllocNode(&I, &F);
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
assert(CallsiteMD);
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
// Add all of the MIBs and their stack nodes.
for (auto &MDOp : MemProfMD->operands()) {
auto *MIBMD = cast<const MDNode>(MDOp);
MDNode *StackNode = getMIBStackNode(MIBMD);
assert(StackNode);
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
AllocNode, StackContext, CallsiteContext,
getMIBAllocType(MIBMD));
}
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
// Memprof and callsite metadata on memory allocations no longer
// needed.
I.setMetadata(LLVMContext::MD_memprof, nullptr);
I.setMetadata(LLVMContext::MD_callsite, nullptr);
}
// For callsite metadata, add to list for this function for later use.
else if (I.getMetadata(LLVMContext::MD_callsite))
CallsWithMetadata.push_back(&I);
1577 | } |
1578 | } |
1579 | if (!CallsWithMetadata.empty()) |
1580 | FuncToCallsWithMetadata[&F] = CallsWithMetadata; |
1581 | } |
1582 | |
1583 | if (DumpCCG) { |
1584 | dbgs() << "CCG before updating call stack chains:\n" ; |
1585 | dbgs() << *this; |
1586 | } |
1587 | |
1588 | if (ExportToDot) |
exportToDot("prestackupdate");
1590 | |
1591 | updateStackNodes(); |
1592 | |
1593 | handleCallsitesWithMultipleTargets(); |
1594 | |
1595 | // Strip off remaining callsite metadata, no longer needed. |
1596 | for (auto &FuncEntry : FuncToCallsWithMetadata) |
1597 | for (auto &Call : FuncEntry.second) |
Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1599 | } |
1600 | |
1601 | IndexCallsiteContextGraph::IndexCallsiteContextGraph( |
1602 | ModuleSummaryIndex &Index, |
1603 | llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
1604 | isPrevailing) |
1605 | : Index(Index), isPrevailing(isPrevailing) { |
1606 | for (auto &I : Index) { |
auto VI = Index.getValueInfo(I);
1608 | for (auto &S : VI.getSummaryList()) { |
1609 | // We should only add the prevailing nodes. Otherwise we may try to clone |
1610 | // in a weak copy that won't be linked (and may be different than the |
1611 | // prevailing version). |
1612 | // We only keep the memprof summary on the prevailing copy now when |
1613 | // building the combined index, as a space optimization, however don't |
1614 | // rely on this optimization. The linker doesn't resolve local linkage |
1615 | // values so don't check whether those are prevailing. |
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1617 | !isPrevailing(VI.getGUID(), S.get())) |
1618 | continue; |
auto *FS = dyn_cast<FunctionSummary>(S.get());
1620 | if (!FS) |
1621 | continue; |
1622 | std::vector<CallInfo> CallsWithMetadata; |
1623 | if (!FS->allocs().empty()) { |
1624 | for (auto &AN : FS->mutableAllocs()) { |
1625 | // This can happen because of recursion elimination handling that |
1626 | // currently exists in ModuleSummaryAnalysis. Skip these for now. |
1627 | // We still added them to the summary because we need to be able to |
1628 | // correlate properly in applyImport in the backends. |
1629 | if (AN.MIBs.empty()) |
1630 | continue; |
CallsWithMetadata.push_back({&AN});
auto *AllocNode = addAllocNode({&AN}, FS);
1633 | // Pass an empty CallStack to the CallsiteContext (second) |
1634 | // parameter, since for ThinLTO we already collapsed out the inlined |
1635 | // stack ids on the allocation call during ModuleSummaryAnalysis. |
1636 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
1637 | EmptyContext; |
1638 | // Now add all of the MIBs and their stack nodes. |
1639 | for (auto &MIB : AN.MIBs) { |
1640 | CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
1641 | StackContext(&MIB); |
1642 | addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( |
AllocNode, StackContext, EmptyContext, MIB.AllocType);
1644 | } |
1645 | assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
// Initialize version 0 on the summary alloc node to the current alloc
// type, unless it has both types, in which case make it default, so
// that if we are unable to clone, the original version always ends up
// with the default allocation behavior.
AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1651 | } |
1652 | } |
1653 | // For callsite metadata, add to list for this function for later use. |
1654 | if (!FS->callsites().empty()) |
1655 | for (auto &SN : FS->mutableCallsites()) |
CallsWithMetadata.push_back({&SN});
1657 | |
1658 | if (!CallsWithMetadata.empty()) |
1659 | FuncToCallsWithMetadata[FS] = CallsWithMetadata; |
1660 | |
1661 | if (!FS->allocs().empty() || !FS->callsites().empty()) |
1662 | FSToVIMap[FS] = VI; |
1663 | } |
1664 | } |
1665 | |
1666 | if (DumpCCG) { |
1667 | dbgs() << "CCG before updating call stack chains:\n" ; |
1668 | dbgs() << *this; |
1669 | } |
1670 | |
1671 | if (ExportToDot) |
exportToDot("prestackupdate");
1673 | |
1674 | updateStackNodes(); |
1675 | |
1676 | handleCallsitesWithMultipleTargets(); |
1677 | } |
1678 | |
1679 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1680 | void CallsiteContextGraph<DerivedCCG, FuncTy, |
1681 | CallTy>::handleCallsitesWithMultipleTargets() { |
// Look for and work around callsites that call multiple functions.
// This can happen for indirect calls, which need better handling, and in
// rarer cases (e.g. macro expansion).
1685 | // TODO: To fix this for indirect calls we will want to perform speculative |
1686 | // devirtualization using either the normal PGO info with ICP, or using the |
1687 | // information in the profiled MemProf contexts. We can do this prior to |
1688 | // this transformation for regular LTO, and for ThinLTO we can simulate that |
1689 | // effect in the summary and perform the actual speculative devirtualization |
1690 | // while cloning in the ThinLTO backend. |
1691 | |
1692 | // Keep track of the new nodes synthesized for discovered tail calls missing |
1693 | // from the profiled contexts. |
1694 | MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; |
1695 | |
1696 | for (auto Entry = NonAllocationCallToContextNodeMap.begin(); |
1697 | Entry != NonAllocationCallToContextNodeMap.end();) { |
1698 | auto *Node = Entry->second; |
1699 | assert(Node->Clones.empty()); |
1700 | // Check all node callees and see if in the same function. |
1701 | bool Removed = false; |
1702 | auto Call = Node->Call.call(); |
1703 | for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { |
1704 | auto Edge = *EI; |
1705 | if (!Edge->Callee->hasCall()) { |
1706 | ++EI; |
1707 | continue; |
1708 | } |
1709 | assert(NodeToCallingFunc.count(Edge->Callee)); |
1710 | // Check if the called function matches that of the callee node. |
1711 | if (calleesMatch(Call, EI, TailCallToContextNodeMap)) |
1712 | continue; |
1713 | RemovedEdgesWithMismatchedCallees++; |
1714 | // Work around by setting Node to have a null call, so it gets |
1715 | // skipped during cloning. Otherwise assignFunctions will assert |
1716 | // because its data structures are not designed to handle this case. |
1717 | Entry = NonAllocationCallToContextNodeMap.erase(Entry); |
1718 | Node->setCall(CallInfo()); |
1719 | Removed = true; |
1720 | break; |
1721 | } |
1722 | if (!Removed) |
1723 | Entry++; |
1724 | } |
1725 | |
1726 | // Add the new nodes after the above loop so that the iteration is not |
1727 | // invalidated. |
1728 | for (auto &[Call, Node] : TailCallToContextNodeMap) |
1729 | NonAllocationCallToContextNodeMap[Call] = Node; |
1730 | } |
1731 | |
1732 | uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
1733 | // In the Module (IR) case this is already the Id. |
1734 | return IdOrIndex; |
1735 | } |
1736 | |
1737 | uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
1738 | // In the Index case this is an index into the stack id list in the summary |
1739 | // index, convert it to an Id. |
1740 | return Index.getStackIdAtIndex(Index: IdOrIndex); |
1741 | } |
1742 | |
1743 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
1744 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( |
1745 | CallTy Call, EdgeIter &EI, |
1746 | MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { |
1747 | auto Edge = *EI; |
1748 | const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
1749 | const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; |
1750 | // Will be populated in order of callee to caller if we find a chain of tail |
1751 | // calls between the profiled caller and callee. |
1752 | std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; |
if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1754 | FoundCalleeChain)) { |
1755 | ++EI; |
1756 | return false; |
1757 | } |
1758 | |
1759 | // The usual case where the profiled callee matches that of the IR/summary. |
1760 | if (FoundCalleeChain.empty()) { |
1761 | ++EI; |
1762 | return true; |
1763 | } |
1764 | |
1765 | auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { |
1766 | auto *CurEdge = Callee->findEdgeFromCaller(Caller); |
1767 | // If there is already an edge between these nodes, simply update it and |
1768 | // return. |
1769 | if (CurEdge) { |
1770 | CurEdge->ContextIds.insert(Edge->ContextIds.begin(), |
1771 | Edge->ContextIds.end()); |
1772 | CurEdge->AllocTypes |= Edge->AllocTypes; |
1773 | return; |
1774 | } |
1775 | // Otherwise, create a new edge and insert it into the caller and callee |
1776 | // lists. |
1777 | auto NewEdge = std::make_shared<ContextEdge>( |
1778 | Callee, Caller, Edge->AllocTypes, Edge->ContextIds); |
1779 | Callee->CallerEdges.push_back(NewEdge); |
1780 | if (Caller == Edge->Caller) { |
1781 | // If we are inserting the new edge into the current edge's caller, insert |
1782 | // the new edge before the current iterator position, and then increment |
1783 | // back to the current edge. |
1784 | EI = Caller->CalleeEdges.insert(EI, NewEdge); |
1785 | ++EI; |
1786 | assert(*EI == Edge && |
1787 | "Iterator position not restored after insert and increment" ); |
1788 | } else |
1789 | Caller->CalleeEdges.push_back(NewEdge); |
1790 | }; |
1791 | |
1792 | // Create new nodes for each found callee and connect in between the profiled |
1793 | // caller and callee. |
1794 | auto *CurCalleeNode = Edge->Callee; |
1795 | for (auto &[NewCall, Func] : FoundCalleeChain) { |
1796 | ContextNode *NewNode = nullptr; |
1797 | // First check if we have already synthesized a node for this tail call. |
1798 | if (TailCallToContextNodeMap.count(NewCall)) { |
1799 | NewNode = TailCallToContextNodeMap[NewCall]; |
1800 | NewNode->ContextIds.insert(Edge->ContextIds.begin(), |
1801 | Edge->ContextIds.end()); |
1802 | NewNode->AllocTypes |= Edge->AllocTypes; |
1803 | } else { |
1804 | FuncToCallsWithMetadata[Func].push_back({NewCall}); |
1805 | // Create Node and record node info. |
1806 | NodeOwner.push_back( |
1807 | std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall)); |
1808 | NewNode = NodeOwner.back().get(); |
1809 | NodeToCallingFunc[NewNode] = Func; |
1810 | TailCallToContextNodeMap[NewCall] = NewNode; |
1811 | NewNode->ContextIds = Edge->ContextIds; |
1812 | NewNode->AllocTypes = Edge->AllocTypes; |
1813 | } |
1814 | |
1815 | // Hook up node to its callee node |
1816 | AddEdge(NewNode, CurCalleeNode); |
1817 | |
1818 | CurCalleeNode = NewNode; |
1819 | } |
1820 | |
1821 | // Hook up edge's original caller to new callee node. |
1822 | AddEdge(Edge->Caller, CurCalleeNode); |
1823 | |
1824 | // Remove old edge |
1825 | Edge->Callee->eraseCallerEdge(Edge.get()); |
1826 | EI = Edge->Caller->CalleeEdges.erase(EI); |
1827 | |
1828 | return true; |
1829 | } |
1830 | |
1831 | bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
1832 | const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
1833 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
1834 | bool &FoundMultipleCalleeChains) { |
1835 | // Stop recursive search if we have already explored the maximum specified |
1836 | // depth. |
1837 | if (Depth > TailCallSearchDepth) |
1838 | return false; |
1839 | |
1840 | auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { |
FoundCalleeChain.push_back({Callsite, F});
1842 | }; |
1843 | |
auto *CalleeFunc = dyn_cast<Function>(CurCallee);
if (!CalleeFunc) {
auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
assert(Alias);
CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1849 | assert(CalleeFunc); |
1850 | } |
1851 | |
1852 | // Look for tail calls in this function, and check if they either call the |
1853 | // profiled callee directly, or indirectly (via a recursive search). |
// Only succeed if there is a single unique tail call chain found between the
// profiled caller and callee; otherwise we could perform incorrect cloning.
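// For example (hypothetical functions): if the profile recorded A calling C
// directly, but in the IR A tail-calls B and B tail-calls C, the unique
// chain through B is accepted and recorded in FoundCalleeChain. If C were
// also reachable from A via a second tail-call chain, we would instead set
// FoundMultipleCalleeChains and reject the match.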
1856 | bool FoundSingleCalleeChain = false; |
1857 | for (auto &BB : *CalleeFunc) { |
1858 | for (auto &I : BB) { |
auto *CB = dyn_cast<CallBase>(&I);
1860 | if (!CB || !CB->isTailCall()) |
1861 | continue; |
1862 | auto *CalledValue = CB->getCalledOperand(); |
1863 | auto *CalledFunction = CB->getCalledFunction(); |
1864 | if (CalledValue && !CalledFunction) { |
1865 | CalledValue = CalledValue->stripPointerCasts(); |
1866 | // Stripping pointer casts can reveal a called function. |
CalledFunction = dyn_cast<Function>(CalledValue);
1868 | } |
1869 | // Check if this is an alias to a function. If so, get the |
1870 | // called aliasee for the checks below. |
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction &&
"Expected null called function in callsite for alias");
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
1875 | } |
1876 | if (!CalledFunction) |
1877 | continue; |
1878 | if (CalledFunction == ProfiledCallee) { |
1879 | if (FoundSingleCalleeChain) { |
1880 | FoundMultipleCalleeChains = true; |
1881 | return false; |
1882 | } |
1883 | FoundSingleCalleeChain = true; |
1884 | FoundProfiledCalleeCount++; |
1885 | FoundProfiledCalleeDepth += Depth; |
1886 | if (Depth > FoundProfiledCalleeMaxDepth) |
1887 | FoundProfiledCalleeMaxDepth = Depth; |
1888 | SaveCallsiteInfo(&I, CalleeFunc); |
1889 | } else if (findProfiledCalleeThroughTailCalls( |
ProfiledCallee, CalledFunction, Depth + 1,
1891 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
1892 | if (FoundMultipleCalleeChains) |
1893 | return false; |
1894 | if (FoundSingleCalleeChain) { |
1895 | FoundMultipleCalleeChains = true; |
1896 | return false; |
1897 | } |
1898 | FoundSingleCalleeChain = true; |
1899 | SaveCallsiteInfo(&I, CalleeFunc); |
1900 | } |
1901 | } |
1902 | } |
1903 | |
1904 | return FoundSingleCalleeChain; |
1905 | } |
1906 | |
1907 | bool ModuleCallsiteContextGraph::calleeMatchesFunc( |
1908 | Instruction *Call, const Function *Func, const Function *CallerFunc, |
1909 | std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { |
auto *CB = dyn_cast<CallBase>(Call);
1911 | if (!CB->getCalledOperand()) |
1912 | return false; |
1913 | auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); |
auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1915 | if (CalleeFunc == Func) |
1916 | return true; |
auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1918 | if (Alias && Alias->getAliasee() == Func) |
1919 | return true; |
1920 | |
1921 | // Recursively search for the profiled callee through tail calls starting with |
1922 | // the actual Callee. The discovered tail call chain is saved in |
// FoundCalleeChain, and we will fix up the graph to include these callsites
1924 | // after returning. |
1925 | // FIXME: We will currently redo the same recursive walk if we find the same |
1926 | // mismatched callee from another callsite. We can improve this with more |
1927 | // bookkeeping of the created chain of new nodes for each mismatch. |
1928 | unsigned Depth = 1; |
1929 | bool FoundMultipleCalleeChains = false; |
if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
1931 | FoundCalleeChain, |
1932 | FoundMultipleCalleeChains)) { |
1933 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " |
1934 | << Func->getName() << " from " << CallerFunc->getName() |
1935 | << " that actually called " << CalleeVal->getName() |
1936 | << (FoundMultipleCalleeChains |
1937 | ? " (found multiple possible chains)" |
1938 | : "" ) |
1939 | << "\n" ); |
1940 | if (FoundMultipleCalleeChains) |
1941 | FoundProfiledCalleeNonUniquelyCount++; |
1942 | return false; |
1943 | } |
1944 | |
1945 | return true; |
1946 | } |
1947 | |
1948 | bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
1949 | ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
1950 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
1951 | bool &FoundMultipleCalleeChains) { |
1952 | // Stop recursive search if we have already explored the maximum specified |
1953 | // depth. |
1954 | if (Depth > TailCallSearchDepth) |
1955 | return false; |
1956 | |
1957 | auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { |
1958 | // Make a CallsiteInfo for each discovered callee, if one hasn't already |
1959 | // been synthesized. |
if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
!FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
// StackIds is empty (we don't have debug info available in the index for
// these callsites).
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
CallsiteInfo *NewCallsiteInfo =
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
FoundCalleeChain.push_back({NewCallsiteInfo, FS});
1969 | }; |
1970 | |
1971 | // Look for tail calls in this function, and check if they either call the |
1972 | // profiled callee directly, or indirectly (via a recursive search). |
1973 | // Only succeed if there is a single unique tail call chain found between the |
// profiled caller and callee; otherwise we could perform incorrect cloning.
1975 | bool FoundSingleCalleeChain = false; |
1976 | for (auto &S : CurCallee.getSummaryList()) { |
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1978 | !isPrevailing(CurCallee.getGUID(), S.get())) |
1979 | continue; |
auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
if (!FS)
continue;
auto FSVI = CurCallee;
auto *AS = dyn_cast<AliasSummary>(S.get());
1985 | if (AS) |
1986 | FSVI = AS->getAliaseeVI(); |
1987 | for (auto &CallEdge : FS->calls()) { |
1988 | if (!CallEdge.second.hasTailCall()) |
1989 | continue; |
1990 | if (CallEdge.first == ProfiledCallee) { |
1991 | if (FoundSingleCalleeChain) { |
1992 | FoundMultipleCalleeChains = true; |
1993 | return false; |
1994 | } |
1995 | FoundSingleCalleeChain = true; |
1996 | FoundProfiledCalleeCount++; |
1997 | FoundProfiledCalleeDepth += Depth; |
1998 | if (Depth > FoundProfiledCalleeMaxDepth) |
1999 | FoundProfiledCalleeMaxDepth = Depth; |
2000 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2001 | // Add FS to FSToVIMap in case it isn't already there. |
2002 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2003 | FSToVIMap[FS] = FSVI; |
2004 | } else if (findProfiledCalleeThroughTailCalls( |
ProfiledCallee, CallEdge.first, Depth + 1,
2006 | FoundCalleeChain, FoundMultipleCalleeChains)) { |
2007 | if (FoundMultipleCalleeChains) |
2008 | return false; |
2009 | if (FoundSingleCalleeChain) { |
2010 | FoundMultipleCalleeChains = true; |
2011 | return false; |
2012 | } |
2013 | FoundSingleCalleeChain = true; |
2014 | CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
2015 | // Add FS to FSToVIMap in case it isn't already there. |
2016 | assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
2017 | FSToVIMap[FS] = FSVI; |
2018 | } |
2019 | } |
2020 | } |
2021 | |
2022 | return FoundSingleCalleeChain; |
2023 | } |
2024 | |
2025 | bool IndexCallsiteContextGraph::calleeMatchesFunc( |
2026 | IndexCall &Call, const FunctionSummary *Func, |
2027 | const FunctionSummary *CallerFunc, |
2028 | std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { |
ValueInfo Callee =
dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
// If there is no summary list then this is a call to an externally defined
// symbol.
AliasSummary *Alias =
Callee.getSummaryList().empty()
? nullptr
: dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2037 | assert(FSToVIMap.count(Func)); |
2038 | auto FuncVI = FSToVIMap[Func]; |
2039 | if (Callee == FuncVI || |
2040 | // If callee is an alias, check the aliasee, since only function |
2041 | // summary base objects will contain the stack node summaries and thus |
2042 | // get a context node. |
2043 | (Alias && Alias->getAliaseeVI() == FuncVI)) |
2044 | return true; |
2045 | |
2046 | // Recursively search for the profiled callee through tail calls starting with |
2047 | // the actual Callee. The discovered tail call chain is saved in |
// FoundCalleeChain, and we will fix up the graph to include these callsites
2049 | // after returning. |
2050 | // FIXME: We will currently redo the same recursive walk if we find the same |
2051 | // mismatched callee from another callsite. We can improve this with more |
2052 | // bookkeeping of the created chain of new nodes for each mismatch. |
2053 | unsigned Depth = 1; |
2054 | bool FoundMultipleCalleeChains = false; |
2055 | if (!findProfiledCalleeThroughTailCalls( |
FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2057 | LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI |
2058 | << " from " << FSToVIMap[CallerFunc] |
2059 | << " that actually called " << Callee |
2060 | << (FoundMultipleCalleeChains |
2061 | ? " (found multiple possible chains)" |
2062 | : "" ) |
2063 | << "\n" ); |
2064 | if (FoundMultipleCalleeChains) |
2065 | FoundProfiledCalleeNonUniquelyCount++; |
2066 | return false; |
2067 | } |
2068 | |
2069 | return true; |
2070 | } |
2071 | |
2072 | static std::string getAllocTypeString(uint8_t AllocTypes) { |
2073 | if (!AllocTypes) |
2074 | return "None" ; |
2075 | std::string Str; |
2076 | if (AllocTypes & (uint8_t)AllocationType::NotCold) |
2077 | Str += "NotCold" ; |
2078 | if (AllocTypes & (uint8_t)AllocationType::Cold) |
2079 | Str += "Cold" ; |
2080 | return Str; |
2081 | } |
2082 | |
2083 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2084 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() |
2085 | const { |
print(dbgs());
dbgs() << "\n";
2088 | } |
2089 | |
2090 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2091 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( |
2092 | raw_ostream &OS) const { |
2093 | OS << "Node " << this << "\n" ; |
2094 | OS << "\t" ; |
2095 | printCall(OS); |
2096 | if (Recursive) |
2097 | OS << " (recursive)" ; |
2098 | OS << "\n" ; |
2099 | OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n" ; |
2100 | OS << "\tContextIds:" ; |
2101 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
2102 | std::sort(first: SortedIds.begin(), last: SortedIds.end()); |
2103 | for (auto Id : SortedIds) |
2104 | OS << " " << Id; |
2105 | OS << "\n" ; |
2106 | OS << "\tCalleeEdges:\n" ; |
2107 | for (auto &Edge : CalleeEdges) |
2108 | OS << "\t\t" << *Edge << "\n" ; |
2109 | OS << "\tCallerEdges:\n" ; |
2110 | for (auto &Edge : CallerEdges) |
2111 | OS << "\t\t" << *Edge << "\n" ; |
2112 | if (!Clones.empty()) { |
2113 | OS << "\tClones: " ; |
2114 | FieldSeparator FS; |
2115 | for (auto *Clone : Clones) |
2116 | OS << FS << Clone; |
2117 | OS << "\n" ; |
2118 | } else if (CloneOf) { |
2119 | OS << "\tClone of " << CloneOf << "\n" ; |
2120 | } |
2121 | } |
2122 | |
2123 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2124 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() |
2125 | const { |
print(dbgs());
dbgs() << "\n";
2128 | } |
2129 | |
2130 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2131 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( |
2132 | raw_ostream &OS) const { |
2133 | OS << "Edge from Callee " << Callee << " to Caller: " << Caller |
2134 | << " AllocTypes: " << getAllocTypeString(AllocTypes); |
2135 | OS << " ContextIds:" ; |
2136 | std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
2137 | std::sort(first: SortedIds.begin(), last: SortedIds.end()); |
2138 | for (auto Id : SortedIds) |
2139 | OS << " " << Id; |
2140 | } |
2141 | |
2142 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2143 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { |
print(dbgs());
2145 | } |
2146 | |
2147 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2148 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( |
2149 | raw_ostream &OS) const { |
2150 | OS << "Callsite Context Graph:\n" ; |
2151 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2152 | for (const auto Node : nodes<GraphType>(this)) { |
2153 | if (Node->isRemoved()) |
2154 | continue; |
2155 | Node->print(OS); |
2156 | OS << "\n" ; |
2157 | } |
2158 | } |
2159 | |
2160 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2161 | static void checkEdge( |
2162 | const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { |
2163 | // Confirm that alloc type is not None and that we have at least one context |
2164 | // id. |
2165 | assert(Edge->AllocTypes != (uint8_t)AllocationType::None); |
2166 | assert(!Edge->ContextIds.empty()); |
2167 | } |
2168 | |
2169 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2170 | static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, |
2171 | bool CheckEdges = true) { |
2172 | if (Node->isRemoved()) |
2173 | return; |
2174 | // Node's context ids should be the union of both its callee and caller edge |
2175 | // context ids. |
2176 | if (Node->CallerEdges.size()) { |
2177 | auto EI = Node->CallerEdges.begin(); |
2178 | auto &FirstEdge = *EI; |
2179 | EI++; |
2180 | DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds); |
2181 | for (; EI != Node->CallerEdges.end(); EI++) { |
2182 | const auto &Edge = *EI; |
2183 | if (CheckEdges) |
2184 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
2185 | set_union(CallerEdgeContextIds, Edge->ContextIds); |
2186 | } |
2187 | // Node can have more context ids than callers if some contexts terminate at |
2188 | // node and some are longer. |
2189 | assert(Node->ContextIds == CallerEdgeContextIds || |
2190 | set_is_subset(CallerEdgeContextIds, Node->ContextIds)); |
2191 | } |
2192 | if (Node->CalleeEdges.size()) { |
2193 | auto EI = Node->CalleeEdges.begin(); |
2194 | auto &FirstEdge = *EI; |
2195 | EI++; |
2196 | DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds); |
2197 | for (; EI != Node->CalleeEdges.end(); EI++) { |
2198 | const auto &Edge = *EI; |
2199 | if (CheckEdges) |
2200 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
2201 | set_union(CalleeEdgeContextIds, Edge->ContextIds); |
2202 | } |
2203 | assert(Node->ContextIds == CalleeEdgeContextIds); |
2204 | } |
2205 | } |
2206 | |
2207 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2208 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { |
2209 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2210 | for (const auto Node : nodes<GraphType>(this)) { |
2211 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2212 | for (auto &Edge : Node->CallerEdges) |
2213 | checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
2214 | } |
2215 | } |
2216 | |
2217 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2218 | struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { |
2219 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2220 | using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; |
2221 | |
2222 | using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; |
2223 | static NodeRef getNode(const NodePtrTy &P) { return P.get(); } |
2224 | |
2225 | using nodes_iterator = |
2226 | mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, |
2227 | decltype(&getNode)>; |
2228 | |
2229 | static nodes_iterator nodes_begin(GraphType G) { |
2230 | return nodes_iterator(G->NodeOwner.begin(), &getNode); |
2231 | } |
2232 | |
2233 | static nodes_iterator nodes_end(GraphType G) { |
2234 | return nodes_iterator(G->NodeOwner.end(), &getNode); |
2235 | } |
2236 | |
2237 | static NodeRef getEntryNode(GraphType G) { |
2238 | return G->NodeOwner.begin()->get(); |
2239 | } |
2240 | |
2241 | using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; |
2242 | static const ContextNode<DerivedCCG, FuncTy, CallTy> * |
2243 | GetCallee(const EdgePtrTy &P) { |
2244 | return P->Callee; |
2245 | } |
2246 | |
2247 | using ChildIteratorType = |
2248 | mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< |
2249 | DerivedCCG, FuncTy, CallTy>>>::const_iterator, |
2250 | decltype(&GetCallee)>; |
2251 | |
2252 | static ChildIteratorType child_begin(NodeRef N) { |
2253 | return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); |
2254 | } |
2255 | |
2256 | static ChildIteratorType child_end(NodeRef N) { |
2257 | return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); |
2258 | } |
2259 | }; |
2260 | |
2261 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2262 | struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> |
2263 | : public DefaultDOTGraphTraits { |
2264 | DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} |
2265 | |
2266 | using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
2267 | using GTraits = GraphTraits<GraphType>; |
2268 | using NodeRef = typename GTraits::NodeRef; |
2269 | using ChildIteratorType = typename GTraits::ChildIteratorType; |
2270 | |
2271 | static std::string getNodeLabel(NodeRef Node, GraphType G) { |
2272 | std::string LabelString = |
2273 | (Twine("OrigId: " ) + (Node->IsAllocation ? "Alloc" : "" ) + |
2274 | Twine(Node->OrigStackOrAllocId)) |
2275 | .str(); |
2276 | LabelString += "\n" ; |
2277 | if (Node->hasCall()) { |
2278 | auto Func = G->NodeToCallingFunc.find(Node); |
2279 | assert(Func != G->NodeToCallingFunc.end()); |
2280 | LabelString += |
2281 | G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); |
2282 | } else { |
2283 | LabelString += "null call" ; |
2284 | if (Node->Recursive) |
2285 | LabelString += " (recursive)" ; |
2286 | else |
2287 | LabelString += " (external)" ; |
2288 | } |
2289 | return LabelString; |
2290 | } |
2291 | |
2292 | static std::string getNodeAttributes(NodeRef Node, GraphType) { |
2293 | std::string AttributeString = (Twine("tooltip=\"" ) + getNodeId(Node) + " " + |
2294 | getContextIds(ContextIds: Node->ContextIds) + "\"" ) |
2295 | .str(); |
2296 | AttributeString += |
2297 | (Twine(",fillcolor=\"" ) + getColor(AllocTypes: Node->AllocTypes) + "\"" ).str(); |
2298 | AttributeString += ",style=\"filled\"" ; |
2299 | if (Node->CloneOf) { |
2300 | AttributeString += ",color=\"blue\"" ; |
2301 | AttributeString += ",style=\"filled,bold,dashed\"" ; |
2302 | } else |
2303 | AttributeString += ",style=\"filled\"" ; |
2304 | return AttributeString; |
2305 | } |
2306 | |
2307 | static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, |
2308 | GraphType) { |
2309 | auto &Edge = *(ChildIter.getCurrent()); |
2310 | return (Twine("tooltip=\"" ) + getContextIds(ContextIds: Edge->ContextIds) + "\"" + |
2311 | Twine(",fillcolor=\"" ) + getColor(AllocTypes: Edge->AllocTypes) + "\"" ) |
2312 | .str(); |
2313 | } |
2314 | |
// Since the NodeOwner list includes nodes that are no longer connected to
// the graph, skip them here.
2317 | static bool isNodeHidden(NodeRef Node, GraphType) { |
2318 | return Node->isRemoved(); |
2319 | } |
2320 | |
2321 | private: |
2322 | static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { |
std::string IdString = "ContextIds:";
if (ContextIds.size() < 100) {
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds)
IdString += (" " + Twine(Id)).str();
} else {
IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2331 | } |
2332 | return IdString; |
2333 | } |
2334 | |
2335 | static std::string getColor(uint8_t AllocTypes) { |
if (AllocTypes == (uint8_t)AllocationType::NotCold)
// Color "brown1" actually looks like a lighter red.
return "brown1";
if (AllocTypes == (uint8_t)AllocationType::Cold)
return "cyan";
if (AllocTypes ==
((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
// Lighter purple.
return "mediumorchid1";
return "gray";
2346 | } |
2347 | |
2348 | static std::string getNodeId(NodeRef Node) { |
2349 | std::stringstream SStream; |
2350 | SStream << std::hex << "N0x" << (unsigned long long)Node; |
2351 | std::string Result = SStream.str(); |
2352 | return Result; |
2353 | } |
2354 | }; |
2355 | |
2356 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2357 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( |
2358 | std::string Label) const { |
2359 | WriteGraph(this, "" , false, Label, |
2360 | DotFilePathPrefix + "ccg." + Label + ".dot" ); |
2361 | } |
2362 | |
2363 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2364 | typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
2365 | CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( |
2366 | const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI, |
2367 | DenseSet<uint32_t> ContextIdsToMove) { |
2368 | ContextNode *Node = Edge->Callee; |
2369 | NodeOwner.push_back( |
2370 | std::make_unique<ContextNode>(Node->IsAllocation, Node->Call)); |
2371 | ContextNode *Clone = NodeOwner.back().get(); |
2372 | Node->addClone(Clone); |
2373 | assert(NodeToCallingFunc.count(Node)); |
2374 | NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; |
moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2376 | ContextIdsToMove); |
2377 | return Clone; |
2378 | } |
2379 | |
2380 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2381 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
2382 | moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
2383 | ContextNode *NewCallee, EdgeIter *CallerEdgeI, |
2384 | bool NewClone, |
2385 | DenseSet<uint32_t> ContextIdsToMove) { |
2386 | // NewCallee and Edge's current callee must be clones of the same original |
2387 | // node (Edge's current callee may be the original node too). |
2388 | assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); |
2389 | |
2390 | ContextNode *OldCallee = Edge->Callee; |
2391 | |
2392 | // We might already have an edge to the new callee from earlier cloning for a |
2393 | // different allocation. If one exists we will reuse it. |
2394 | auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller); |
2395 | |
2396 | // Callers will pass an empty ContextIdsToMove set when they want to move the |
2397 | // edge. Copy in Edge's ids for simplicity. |
2398 | if (ContextIdsToMove.empty()) |
2399 | ContextIdsToMove = Edge->getContextIds(); |
2400 | |
2401 | // If we are moving all of Edge's ids, then just move the whole Edge. |
2402 | // Otherwise only move the specified subset, to a new edge if needed. |
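// For example (hypothetical ids): if Edge carries context ids {1, 2, 3} and
// ContextIdsToMove is {1, 2, 3}, the whole edge is redirected to NewCallee
// (or merged into an existing edge to it); if ContextIdsToMove is just {1},
// the edge is split, with {1} moving and {2, 3} staying on Edge.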
2403 | if (Edge->getContextIds().size() == ContextIdsToMove.size()) { |
2404 | // Moving the whole Edge. |
2405 | if (CallerEdgeI) |
2406 | *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); |
2407 | else |
2408 | OldCallee->eraseCallerEdge(Edge.get()); |
2409 | if (ExistingEdgeToNewCallee) { |
2410 | // Since we already have an edge to NewCallee, simply move the ids |
2411 | // onto it, and remove the existing Edge. |
2412 | ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), |
2413 | ContextIdsToMove.end()); |
2414 | ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; |
2415 | assert(Edge->ContextIds == ContextIdsToMove); |
2416 | Edge->ContextIds.clear(); |
2417 | Edge->AllocTypes = (uint8_t)AllocationType::None; |
2418 | Edge->Caller->eraseCalleeEdge(Edge.get()); |
2419 | } else { |
2420 | // Otherwise just reconnect Edge to NewCallee. |
2421 | Edge->Callee = NewCallee; |
2422 | NewCallee->CallerEdges.push_back(Edge); |
2423 | // Don't need to update Edge's context ids since we are simply |
2424 | // reconnecting it. |
2425 | } |
// In either case, need to update the alloc types on NewCallee.
2427 | NewCallee->AllocTypes |= Edge->AllocTypes; |
2428 | } else { |
2429 | // Only moving a subset of Edge's ids. |
2430 | if (CallerEdgeI) |
2431 | ++CallerEdgeI; |
2432 | // Compute the alloc type of the subset of ids being moved. |
auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2434 | if (ExistingEdgeToNewCallee) { |
2435 | // Since we already have an edge to NewCallee, simply move the ids |
2436 | // onto it. |
2437 | ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), |
2438 | ContextIdsToMove.end()); |
2439 | ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType; |
2440 | } else { |
2441 | // Otherwise, create a new edge to NewCallee for the ids being moved. |
2442 | auto NewEdge = std::make_shared<ContextEdge>( |
2443 | NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove); |
2444 | Edge->Caller->CalleeEdges.push_back(NewEdge); |
2445 | NewCallee->CallerEdges.push_back(NewEdge); |
2446 | } |
2447 | // In either case, need to update the alloc types on NewCallee, and remove |
2448 | // those ids and update the alloc type on the original Edge. |
2449 | NewCallee->AllocTypes |= CallerEdgeAllocType; |
2450 | set_subtract(Edge->ContextIds, ContextIdsToMove); |
Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2452 | } |
2453 | // Now perform some updates that are common to all cases: the NewCallee gets |
2454 | // the moved ids added, and we need to remove those ids from OldCallee and |
2455 | // update its alloc type (NewCallee alloc type updates handled above). |
2456 | NewCallee->ContextIds.insert(ContextIdsToMove.begin(), |
2457 | ContextIdsToMove.end()); |
2458 | set_subtract(OldCallee->ContextIds, ContextIdsToMove); |
OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2460 | // OldCallee alloc type should be None iff its context id set is now empty. |
2461 | assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == |
2462 | OldCallee->ContextIds.empty()); |
2463 | // Now walk the old callee node's callee edges and move Edge's context ids |
2464 | // over to the corresponding edge into the clone (which is created here if |
2465 | // this is a newly created clone). |
2466 | for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { |
2467 | // The context ids moving to the new callee are the subset of this edge's |
2468 | // context ids and the context ids on the caller edge being moved. |
2469 | DenseSet<uint32_t> EdgeContextIdsToMove = |
2470 | set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove); |
2471 | set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); |
2472 | OldCalleeEdge->AllocTypes = |
computeAllocType(OldCalleeEdge->getContextIds());
2474 | if (!NewClone) { |
2475 | // Update context ids / alloc type on corresponding edge to NewCallee. |
2476 | // There is a chance this may not exist if we are reusing an existing |
2477 | // clone, specifically during function assignment, where we would have |
2478 | // removed none type edges after creating the clone. If we can't find |
2479 | // a corresponding edge there, fall through to the cloning below. |
2480 | if (auto *NewCalleeEdge = |
2481 | NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) { |
2482 | NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(), |
2483 | EdgeContextIdsToMove.end()); |
NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2485 | continue; |
2486 | } |
2487 | } |
2488 | auto NewEdge = std::make_shared<ContextEdge>( |
2489 | OldCalleeEdge->Callee, NewCallee, |
        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2491 | NewCallee->CalleeEdges.push_back(NewEdge); |
2492 | NewEdge->Callee->CallerEdges.push_back(NewEdge); |
2493 | } |
2494 | if (VerifyCCG) { |
2495 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); |
2496 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); |
2497 | for (const auto &OldCalleeEdge : OldCallee->CalleeEdges) |
2498 | checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee, |
2499 | /*CheckEdges=*/false); |
2500 | for (const auto &NewCalleeEdge : NewCallee->CalleeEdges) |
2501 | checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee, |
2502 | /*CheckEdges=*/false); |
2503 | } |
2504 | } |
2505 | |
2506 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2507 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
2508 | recursivelyRemoveNoneTypeCalleeEdges( |
2509 | ContextNode *Node, DenseSet<const ContextNode *> &Visited) { |
2510 | auto Inserted = Visited.insert(Node); |
2511 | if (!Inserted.second) |
2512 | return; |
2513 | |
2514 | removeNoneTypeCalleeEdges(Node); |
2515 | |
2516 | for (auto *Clone : Node->Clones) |
    recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2518 | |
2519 | // The recursive call may remove some of this Node's caller edges. |
2520 | // Iterate over a copy and skip any that were removed. |
2521 | auto CallerEdges = Node->CallerEdges; |
2522 | for (auto &Edge : CallerEdges) { |
2523 | // Skip any that have been removed by an earlier recursive call. |
2524 | if (Edge->Callee == nullptr && Edge->Caller == nullptr) { |
2525 | assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(), |
2526 | Edge)); |
2527 | continue; |
2528 | } |
    recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2530 | } |
2531 | } |
2532 | |
2533 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2534 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { |
2535 | DenseSet<const ContextNode *> Visited; |
2536 | for (auto &Entry : AllocationCallToContextNodeMap) { |
2537 | Visited.clear(); |
2538 | identifyClones(Entry.second, Visited, Entry.second->ContextIds); |
2539 | } |
2540 | Visited.clear(); |
2541 | for (auto &Entry : AllocationCallToContextNodeMap) |
    recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2543 | if (VerifyCCG) |
2544 | check(); |
2545 | } |
2546 | |
// Helper function to check whether an AllocType is Cold, NotCold, or both.
2548 | bool checkColdOrNotCold(uint8_t AllocType) { |
2549 | return (AllocType == (uint8_t)AllocationType::Cold) || |
2550 | (AllocType == (uint8_t)AllocationType::NotCold) || |
2551 | (AllocType == |
2552 | ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold)); |
2553 | } |
2554 | |
2555 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2556 | void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( |
2557 | ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
2558 | const DenseSet<uint32_t> &AllocContextIds) { |
2559 | if (VerifyNodes) |
2560 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2561 | assert(!Node->CloneOf); |
2562 | |
  // If Node has a null call, then either it wasn't found in the module (regular
2564 | // LTO) or summary index (ThinLTO), or there were other conditions blocking |
2565 | // cloning (e.g. recursion, calls multiple targets, etc). |
2566 | // Do this here so that we don't try to recursively clone callers below, which |
2567 | // isn't useful at least for this node. |
2568 | if (!Node->hasCall()) |
2569 | return; |
2570 | |
2571 | #ifndef NDEBUG |
2572 | auto Insert = |
2573 | #endif |
2574 | Visited.insert(Node); |
2575 | // We should not have visited this node yet. |
2576 | assert(Insert.second); |
2577 | // The recursive call to identifyClones may delete the current edge from the |
2578 | // CallerEdges vector. Make a copy and iterate on that, simpler than passing |
2579 | // in an iterator and having recursive call erase from it. Other edges may |
2580 | // also get removed during the recursion, which will have null Callee and |
2581 | // Caller pointers (and are deleted later), so we skip those below. |
2582 | { |
2583 | auto CallerEdges = Node->CallerEdges; |
2584 | for (auto &Edge : CallerEdges) { |
2585 | // Skip any that have been removed by an earlier recursive call. |
2586 | if (Edge->Callee == nullptr && Edge->Caller == nullptr) { |
2587 | assert(!llvm::count(Node->CallerEdges, Edge)); |
2588 | continue; |
2589 | } |
2590 | // Ignore any caller we previously visited via another edge. |
2591 | if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) { |
2592 | identifyClones(Edge->Caller, Visited, AllocContextIds); |
2593 | } |
2594 | } |
2595 | } |
2596 | |
  // Check if we reached an unambiguous call or have only a single caller.
2598 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
2599 | return; |
2600 | |
2601 | // We need to clone. |
2602 | |
2603 | // Try to keep the original version as alloc type NotCold. This will make |
2604 | // cases with indirect calls or any other situation with an unknown call to |
2605 | // the original function get the default behavior. We do this by sorting the |
2606 | // CallerEdges of the Node we will clone by alloc type. |
2607 | // |
2608 | // Give NotCold edge the lowest sort priority so those edges are at the end of |
2609 | // the caller edges vector, and stay on the original version (since the below |
2610 | // code clones greedily until it finds all remaining edges have the same type |
2611 | // and leaves the remaining ones on the original Node). |
2612 | // |
2613 | // We shouldn't actually have any None type edges, so the sorting priority for |
2614 | // that is arbitrary, and we assert in that case below. |
2615 | const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4, |
2616 | /*Cold*/ 1, |
2617 | /*NotColdCold*/ 2}; |
2618 | std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(), |
2619 | [&](const std::shared_ptr<ContextEdge> &A, |
2620 | const std::shared_ptr<ContextEdge> &B) { |
2621 | // Nodes with non-empty context ids should be sorted before |
2622 | // those with empty context ids. |
2623 | if (A->ContextIds.empty()) |
2624 | // Either B ContextIds are non-empty (in which case we |
2625 | // should return false because B < A), or B ContextIds |
2626 | // are empty, in which case they are equal, and we should |
2627 | // maintain the original relative ordering. |
2628 | return false; |
2629 | if (B->ContextIds.empty()) |
2630 | return true; |
2631 | |
2632 | if (A->AllocTypes == B->AllocTypes) |
2633 | // Use the first context id for each edge as a |
2634 | // tie-breaker. |
2635 | return *A->ContextIds.begin() < *B->ContextIds.begin(); |
2636 | return AllocTypeCloningPriority[A->AllocTypes] < |
2637 | AllocTypeCloningPriority[B->AllocTypes]; |
2638 | }); |
2639 | |
2640 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2641 | |
2642 | // Iterate until we find no more opportunities for disambiguating the alloc |
2643 | // types via cloning. In most cases this loop will terminate once the Node |
2644 | // has a single allocation type, in which case no more cloning is needed. |
2645 | // We need to be able to remove Edge from CallerEdges, so need to adjust |
2646 | // iterator inside the loop. |
2647 | for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) { |
2648 | auto CallerEdge = *EI; |
2649 | |
2650 | // See if cloning the prior caller edge left this node with a single alloc |
2651 | // type or a single caller. In that case no more cloning of Node is needed. |
2652 | if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
2653 | break; |
2654 | |
2655 | // Only need to process the ids along this edge pertaining to the given |
2656 | // allocation. |
2657 | auto CallerEdgeContextsForAlloc = |
2658 | set_intersection(CallerEdge->getContextIds(), AllocContextIds); |
2659 | if (CallerEdgeContextsForAlloc.empty()) { |
2660 | ++EI; |
2661 | continue; |
2662 | } |
    auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2664 | |
2665 | // Compute the node callee edge alloc types corresponding to the context ids |
2666 | // for this caller edge. |
2667 | std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; |
    CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2669 | for (auto &CalleeEdge : Node->CalleeEdges) |
      CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
          CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2672 | |
2673 | // Don't clone if doing so will not disambiguate any alloc types amongst |
2674 | // caller edges (including the callee edges that would be cloned). |
2675 | // Otherwise we will simply move all edges to the clone. |
2676 | // |
2677 | // First check if by cloning we will disambiguate the caller allocation |
2678 | // type from node's allocation type. Query allocTypeToUse so that we don't |
2679 | // bother cloning to distinguish NotCold+Cold from NotCold. Note that |
2680 | // neither of these should be None type. |
2681 | // |
2682 | // Then check if by cloning node at least one of the callee edges will be |
2683 | // disambiguated by splitting out different context ids. |
2684 | assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); |
2685 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2686 | if (allocTypeToUse(CallerAllocTypeForAlloc) == |
2687 | allocTypeToUse(Node->AllocTypes) && |
2688 | allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
2689 | CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { |
2690 | ++EI; |
2691 | continue; |
2692 | } |
2693 | |
2694 | // First see if we can use an existing clone. Check each clone and its |
2695 | // callee edges for matching alloc types. |
2696 | ContextNode *Clone = nullptr; |
2697 | for (auto *CurClone : Node->Clones) { |
2698 | if (allocTypeToUse(CurClone->AllocTypes) != |
2699 | allocTypeToUse(CallerAllocTypeForAlloc)) |
2700 | continue; |
2701 | |
2702 | if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
2703 | CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges)) |
2704 | continue; |
2705 | Clone = CurClone; |
2706 | break; |
2707 | } |
2708 | |
2709 | // The edge iterator is adjusted when we move the CallerEdge to the clone. |
2710 | if (Clone) |
      moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
                                    CallerEdgeContextsForAlloc);
2713 | else |
      Clone =
          moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2716 | |
2717 | assert(EI == Node->CallerEdges.end() || |
2718 | Node->AllocTypes != (uint8_t)AllocationType::None); |
2719 | // Sanity check that no alloc types on clone or its edges are None. |
2720 | assert(Clone->AllocTypes != (uint8_t)AllocationType::None); |
2721 | } |
2722 | |
2723 | // We should still have some context ids on the original Node. |
2724 | assert(!Node->ContextIds.empty()); |
2725 | |
2726 | // Sanity check that no alloc types on node or edges are None. |
2727 | assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
2728 | |
2729 | if (VerifyNodes) |
2730 | checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
2731 | } |
2732 | |
2733 | void ModuleCallsiteContextGraph::updateAllocationCall( |
2734 | CallInfo &Call, AllocationType AllocType) { |
  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                "memprof", AllocTypeString);
  cast<CallBase>(Call.call())->addFnAttr(A);
2739 | OREGetter(Call.call()->getFunction()) |
2740 | .emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute" , Call.call()) |
2741 | << ore::NV("AllocationCall" , Call.call()) << " in clone " |
2742 | << ore::NV("Caller" , Call.call()->getFunction()) |
2743 | << " marked with memprof allocation attribute " |
2744 | << ore::NV("Attribute" , AllocTypeString)); |
2745 | } |
2746 | |
2747 | void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, |
2748 | AllocationType AllocType) { |
2749 | auto *AI = Call.call().dyn_cast<AllocInfo *>(); |
2750 | assert(AI); |
2751 | assert(AI->Versions.size() > Call.cloneNo()); |
2752 | AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; |
2753 | } |
2754 | |
2755 | void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
2756 | FuncInfo CalleeFunc) { |
2757 | if (CalleeFunc.cloneNo() > 0) |
    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2759 | OREGetter(CallerCall.call()->getFunction()) |
2760 | .emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CallerCall.call()) |
2761 | << ore::NV("Call" , CallerCall.call()) << " in clone " |
2762 | << ore::NV("Caller" , CallerCall.call()->getFunction()) |
2763 | << " assigned to call function clone " |
2764 | << ore::NV("Callee" , CalleeFunc.func())); |
2765 | } |
2766 | |
2767 | void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
2768 | FuncInfo CalleeFunc) { |
2769 | auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>(); |
2770 | assert(CI && |
2771 | "Caller cannot be an allocation which should not have profiled calls" ); |
2772 | assert(CI->Clones.size() > CallerCall.cloneNo()); |
2773 | CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); |
2774 | } |
2775 | |
2776 | CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
2777 | Instruction *>::FuncInfo |
2778 | ModuleCallsiteContextGraph::cloneFunctionForCallsite( |
2779 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
2780 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
  // Use existing LLVM facilities for cloning and obtaining Call in the clone.
  ValueToValueMapTy VMap;
  auto *NewFunc = CloneFunction(Func.func(), VMap);
  std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2785 | assert(!Func.func()->getParent()->getFunction(Name)); |
2786 | NewFunc->setName(Name); |
2787 | for (auto &Inst : CallsWithMetadataInFunc) { |
2788 | // This map always has the initial version in it. |
2789 | assert(Inst.cloneNo() == 0); |
    CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2791 | } |
2792 | OREGetter(Func.func()) |
2793 | .emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofClone" , Func.func()) |
2794 | << "created clone " << ore::NV("NewFunction" , NewFunc)); |
2795 | return {NewFunc, CloneNo}; |
2796 | } |
2797 | |
2798 | CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
2799 | IndexCall>::FuncInfo |
2800 | IndexCallsiteContextGraph::cloneFunctionForCallsite( |
2801 | FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
2802 | std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
2803 | // Check how many clones we have of Call (and therefore function). |
2804 | // The next clone number is the current size of versions array. |
2805 | // Confirm this matches the CloneNo provided by the caller, which is based on |
2806 | // the number of function clones we have. |
2807 | assert(CloneNo == |
2808 | (Call.call().is<AllocInfo *>() |
2809 | ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() |
2810 | : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); |
2811 | // Walk all the instructions in this function. Create a new version for |
2812 | // each (by adding an entry to the Versions/Clones summary array), and copy |
2813 | // over the version being called for the function clone being cloned here. |
2814 | // Additionally, add an entry to the CallMap for the new function clone, |
2815 | // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) |
2816 | // to the new call clone. |
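  //
  // For example (illustrative): when function clone 1 is created, an
  // allocation summary whose Versions array is currently {V0} gets a zero
  // (None) placeholder appended, giving {V0, 0}; updateAllocationCall later
  // overwrites that entry with the allocation type chosen for the clone.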
2817 | for (auto &Inst : CallsWithMetadataInFunc) { |
2818 | // This map always has the initial version in it. |
2819 | assert(Inst.cloneNo() == 0); |
2820 | if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) { |
2821 | assert(AI->Versions.size() == CloneNo); |
2822 | // We assign the allocation type later (in updateAllocationCall), just add |
2823 | // an entry for it here. |
      AI->Versions.push_back(0);
2825 | } else { |
2826 | auto *CI = Inst.call().dyn_cast<CallsiteInfo *>(); |
2827 | assert(CI && CI->Clones.size() == CloneNo); |
2828 | // We assign the clone number later (in updateCall), just add an entry for |
2829 | // it here. |
      CI->Clones.push_back(0);
2831 | } |
2832 | CallMap[Inst] = {Inst.call(), CloneNo}; |
2833 | } |
2834 | return {Func.func(), CloneNo}; |
2835 | } |
2836 | |
2837 | // This method assigns cloned callsites to functions, cloning the functions as |
2838 | // needed. The assignment is greedy and proceeds roughly as follows: |
2839 | // |
2840 | // For each function Func: |
2841 | // For each call with graph Node having clones: |
2842 | // Initialize ClonesWorklist to Node and its clones |
2843 | // Initialize NodeCloneCount to 0 |
2844 | // While ClonesWorklist is not empty: |
2845 | // Clone = pop front ClonesWorklist |
2846 | // NodeCloneCount++ |
2847 | // If Func has been cloned less than NodeCloneCount times: |
2848 | // If NodeCloneCount is 1: |
2849 | // Assign Clone to original Func |
2850 | // Continue |
2851 | // Create a new function clone |
2852 | // If other callers not assigned to call a function clone yet: |
2853 | // Assign them to call new function clone |
2854 | // Continue |
2855 | // Assign any other caller calling the cloned version to new clone |
2856 | // |
2857 | // For each caller of Clone: |
2858 | // If caller is assigned to call a specific function clone: |
2859 | // If we cannot assign Clone to that function clone: |
2860 | // Create new callsite Clone NewClone |
2861 | // Add NewClone to ClonesWorklist |
2862 | // Continue |
2863 | // Assign Clone to existing caller's called function clone |
2864 | // Else: |
2865 | // If Clone not already assigned to a function clone: |
2866 | // Assign to first function clone without assignment |
2867 | // Assign caller to selected function clone |
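//
// For example (an illustrative walk-through, not from a real profile):
// suppose function A contains callsite node C with one clone C.1, and the
// callers reaching C.1 have not yet been assigned to any clone of A.
// Processing C assigns it to the original A; processing C.1 finds only one
// existing copy of A, so a new function clone of A is created, C.1 is
// assigned to it, and C.1's callers are recorded as calling the new clone.
// If some caller of C.1 had instead already been pinned to a clone of A that
// conflicts with this assignment, C.1 would be cloned again and the new
// callsite clone pushed onto the worklist for a later iteration.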
2868 | template <typename DerivedCCG, typename FuncTy, typename CallTy> |
2869 | bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { |
2870 | bool Changed = false; |
2871 | |
2872 | // Keep track of the assignment of nodes (callsites) to function clones they |
2873 | // call. |
2874 | DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; |
2875 | |
2876 | // Update caller node to call function version CalleeFunc, by recording the |
2877 | // assignment in CallsiteToCalleeFuncCloneMap. |
2878 | auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, |
2879 | const FuncInfo &CalleeFunc) { |
2880 | assert(Caller->hasCall()); |
2881 | CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; |
2882 | }; |
2883 | |
  // Walk all functions for which we saw calls with memprof metadata, and
  // handle cloning for each of their calls.
2886 | for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
2887 | FuncInfo OrigFunc(Func); |
2888 | // Map from each clone of OrigFunc to a map of remappings of each call of |
2889 | // interest (from original uncloned call to the corresponding cloned call in |
2890 | // that function clone). |
2891 | std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; |
2892 | for (auto &Call : CallsWithMetadata) { |
      ContextNode *Node = getNodeForInst(Call);
2894 | // Skip call if we do not have a node for it (all uses of its stack ids |
2895 | // were either on inlined chains or pruned from the MIBs), or if we did |
2896 | // not create any clones for it. |
2897 | if (!Node || Node->Clones.empty()) |
2898 | continue; |
2899 | assert(Node->hasCall() && |
2900 | "Not having a call should have prevented cloning" ); |
2901 | |
2902 | // Track the assignment of function clones to clones of the current |
2903 | // callsite Node being handled. |
2904 | std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; |
2905 | |
2906 | // Assign callsite version CallsiteClone to function version FuncClone, |
2907 | // and also assign (possibly cloned) Call to CallsiteClone. |
2908 | auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, |
2909 | CallInfo &Call, |
2910 | ContextNode *CallsiteClone, |
2911 | bool IsAlloc) { |
2912 | // Record the clone of callsite node assigned to this function clone. |
2913 | FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; |
2914 | |
2915 | assert(FuncClonesToCallMap.count(FuncClone)); |
2916 | std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; |
2917 | CallInfo CallClone(Call); |
2918 | if (CallMap.count(Call)) |
2919 | CallClone = CallMap[Call]; |
2920 | CallsiteClone->setCall(CallClone); |
2921 | }; |
2922 | |
2923 | // Keep track of the clones of callsite Node that need to be assigned to |
2924 | // function clones. This list may be expanded in the loop body below if we |
2925 | // find additional cloning is required. |
2926 | std::deque<ContextNode *> ClonesWorklist; |
2927 | // Ignore original Node if we moved all of its contexts to clones. |
2928 | if (!Node->ContextIds.empty()) |
2929 | ClonesWorklist.push_back(Node); |
2930 | ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), |
2931 | Node->Clones.end()); |
2932 | |
2933 | // Now walk through all of the clones of this callsite Node that we need, |
2934 | // and determine the assignment to a corresponding clone of the current |
2935 | // function (creating new function clones as needed). |
2936 | unsigned NodeCloneCount = 0; |
2937 | while (!ClonesWorklist.empty()) { |
2938 | ContextNode *Clone = ClonesWorklist.front(); |
2939 | ClonesWorklist.pop_front(); |
2940 | NodeCloneCount++; |
2941 | if (VerifyNodes) |
2942 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
2943 | |
2944 | // Need to create a new function clone if we have more callsite clones |
2945 | // than existing function clones, which would have been assigned to an |
2946 | // earlier clone in the list (we assign callsite clones to function |
2947 | // clones greedily). |
2948 | if (FuncClonesToCallMap.size() < NodeCloneCount) { |
2949 | // If this is the first callsite copy, assign to original function. |
2950 | if (NodeCloneCount == 1) { |
2951 | // Since FuncClonesToCallMap is empty in this case, no clones have |
2952 | // been created for this function yet, and no callers should have |
2953 | // been assigned a function clone for this callee node yet. |
2954 | assert(llvm::none_of( |
2955 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
2956 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
2957 | })); |
2958 | // Initialize with empty call map, assign Clone to original function |
2959 | // and its callers, and skip to the next clone. |
2960 | FuncClonesToCallMap[OrigFunc] = {}; |
2961 | AssignCallsiteCloneToFuncClone( |
2962 | OrigFunc, Call, Clone, |
2963 | AllocationCallToContextNodeMap.count(Call)); |
2964 | for (auto &CE : Clone->CallerEdges) { |
2965 | // Ignore any caller that does not have a recorded callsite Call. |
2966 | if (!CE->Caller->hasCall()) |
2967 | continue; |
2968 | RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); |
2969 | } |
2970 | continue; |
2971 | } |
2972 | |
2973 | // First locate which copy of OrigFunc to clone again. If a caller |
2974 | // of this callsite clone was already assigned to call a particular |
2975 | // function clone, we need to redirect all of those callers to the |
2976 | // new function clone, and update their other callees within this |
2977 | // function. |
2978 | FuncInfo PreviousAssignedFuncClone; |
2979 | auto EI = llvm::find_if( |
2980 | Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
2981 | return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
2982 | }); |
2983 | bool CallerAssignedToCloneOfFunc = false; |
2984 | if (EI != Clone->CallerEdges.end()) { |
2985 | const std::shared_ptr<ContextEdge> &Edge = *EI; |
2986 | PreviousAssignedFuncClone = |
2987 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
2988 | CallerAssignedToCloneOfFunc = true; |
2989 | } |
2990 | |
2991 | // Clone function and save it along with the CallInfo map created |
2992 | // during cloning in the FuncClonesToCallMap. |
2993 | std::map<CallInfo, CallInfo> NewCallMap; |
2994 | unsigned CloneNo = FuncClonesToCallMap.size(); |
2995 | assert(CloneNo > 0 && "Clone 0 is the original function, which " |
2996 | "should already exist in the map" ); |
2997 | FuncInfo NewFuncClone = cloneFunctionForCallsite( |
            OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2999 | FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); |
3000 | FunctionClonesAnalysis++; |
3001 | Changed = true; |
3002 | |
3003 | // If no caller callsites were already assigned to a clone of this |
3004 | // function, we can simply assign this clone to the new func clone |
3005 | // and update all callers to it, then skip to the next clone. |
3006 | if (!CallerAssignedToCloneOfFunc) { |
3007 | AssignCallsiteCloneToFuncClone( |
3008 | NewFuncClone, Call, Clone, |
3009 | AllocationCallToContextNodeMap.count(Call)); |
3010 | for (auto &CE : Clone->CallerEdges) { |
3011 | // Ignore any caller that does not have a recorded callsite Call. |
3012 | if (!CE->Caller->hasCall()) |
3013 | continue; |
3014 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
3015 | } |
3016 | continue; |
3017 | } |
3018 | |
3019 | // We may need to do additional node cloning in this case. |
3020 | // Reset the CallsiteToCalleeFuncCloneMap entry for any callers |
3021 | // that were previously assigned to call PreviousAssignedFuncClone, |
3022 | // to record that they now call NewFuncClone. |
3023 | for (auto CE : Clone->CallerEdges) { |
3024 | // Skip any that have been removed on an earlier iteration. |
3025 | if (!CE) |
3026 | continue; |
3027 | // Ignore any caller that does not have a recorded callsite Call. |
3028 | if (!CE->Caller->hasCall()) |
3029 | continue; |
3030 | |
3031 | if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || |
3032 | // We subsequently fall through to later handling that |
3033 | // will perform any additional cloning required for |
3034 | // callers that were calling other function clones. |
3035 | CallsiteToCalleeFuncCloneMap[CE->Caller] != |
3036 | PreviousAssignedFuncClone) |
3037 | continue; |
3038 | |
3039 | RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
3040 | |
3041 | // If we are cloning a function that was already assigned to some |
3042 | // callers, then essentially we are creating new callsite clones |
3043 | // of the other callsites in that function that are reached by those |
3044 | // callers. Clone the other callees of the current callsite's caller |
3045 | // that were already assigned to PreviousAssignedFuncClone |
3046 | // accordingly. This is important since we subsequently update the |
3047 | // calls from the nodes in the graph and their assignments to callee |
3048 | // functions recorded in CallsiteToCalleeFuncCloneMap. |
3049 | for (auto CalleeEdge : CE->Caller->CalleeEdges) { |
3050 | // Skip any that have been removed on an earlier iteration when |
3051 | // cleaning up newly None type callee edges. |
3052 | if (!CalleeEdge) |
3053 | continue; |
3054 | ContextNode *Callee = CalleeEdge->Callee; |
3055 | // Skip the current callsite, we are looking for other |
3056 | // callsites Caller calls, as well as any that does not have a |
3057 | // recorded callsite Call. |
3058 | if (Callee == Clone || !Callee->hasCall()) |
3059 | continue; |
            ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
            removeNoneTypeCalleeEdges(NewClone);
3062 | // Moving the edge may have resulted in some none type |
3063 | // callee edges on the original Callee. |
            removeNoneTypeCalleeEdges(Callee);
3065 | assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
3066 | // If the Callee node was already assigned to call a specific |
3067 | // function version, make sure its new clone is assigned to call |
3068 | // that same function clone. |
3069 | if (CallsiteToCalleeFuncCloneMap.count(Callee)) |
3070 | RecordCalleeFuncOfCallsite( |
3071 | NewClone, CallsiteToCalleeFuncCloneMap[Callee]); |
3072 | // Update NewClone with the new Call clone of this callsite's Call |
3073 | // created for the new function clone created earlier. |
3074 | // Recall that we have already ensured when building the graph |
3075 | // that each caller can only call callsites within the same |
3076 | // function, so we are guaranteed that Callee Call is in the |
3077 | // current OrigFunc. |
3078 | // CallMap is set up as indexed by original Call at clone 0. |
3079 | CallInfo OrigCall(Callee->getOrigNode()->Call); |
3080 | OrigCall.setCloneNo(0); |
3081 | std::map<CallInfo, CallInfo> &CallMap = |
3082 | FuncClonesToCallMap[NewFuncClone]; |
3083 | assert(CallMap.count(OrigCall)); |
3084 | CallInfo NewCall(CallMap[OrigCall]); |
3085 | assert(NewCall); |
3086 | NewClone->setCall(NewCall); |
3087 | } |
3088 | } |
3089 | // Fall through to handling below to perform the recording of the |
3090 | // function for this callsite clone. This enables handling of cases |
3091 | // where the callers were assigned to different clones of a function. |
3092 | } |
3093 | |
3094 | // See if we can use existing function clone. Walk through |
3095 | // all caller edges to see if any have already been assigned to |
3096 | // a clone of this callsite's function. If we can use it, do so. If not, |
3097 | // because that function clone is already assigned to a different clone |
3098 | // of this callsite, then we need to clone again. |
3099 | // Basically, this checking is needed to handle the case where different |
3100 | // caller functions/callsites may need versions of this function |
3101 | // containing different mixes of callsite clones across the different |
3102 | // callsites within the function. If that happens, we need to create |
3103 | // additional function clones to handle the various combinations. |
3104 | // |
3105 | // Keep track of any new clones of this callsite created by the |
3106 | // following loop, as well as any existing clone that we decided to |
3107 | // assign this clone to. |
3108 | std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; |
3109 | FuncInfo FuncCloneAssignedToCurCallsiteClone; |
3110 | // We need to be able to remove Edge from CallerEdges, so need to adjust |
3111 | // iterator in the loop. |
3112 | for (auto EI = Clone->CallerEdges.begin(); |
3113 | EI != Clone->CallerEdges.end();) { |
3114 | auto Edge = *EI; |
3115 | // Ignore any caller that does not have a recorded callsite Call. |
3116 | if (!Edge->Caller->hasCall()) { |
3117 | EI++; |
3118 | continue; |
3119 | } |
3120 | // If this caller already assigned to call a version of OrigFunc, need |
3121 | // to ensure we can assign this callsite clone to that function clone. |
3122 | if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { |
3123 | FuncInfo FuncCloneCalledByCaller = |
3124 | CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
3125 | // First we need to confirm that this function clone is available |
3126 | // for use by this callsite node clone. |
3127 | // |
3128 | // While FuncCloneToCurNodeCloneMap is built only for this Node and |
3129 | // its callsite clones, one of those callsite clones X could have |
3130 | // been assigned to the same function clone called by Edge's caller |
3131 | // - if Edge's caller calls another callsite within Node's original |
3132 | // function, and that callsite has another caller reaching clone X. |
3133 | // We need to clone Node again in this case. |
3134 | if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && |
3135 | FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != |
3136 | Clone) || |
3137 | // Detect when we have multiple callers of this callsite that |
3138 | // have already been assigned to specific, and different, clones |
3139 | // of OrigFunc (due to other unrelated callsites in Func they |
3140 | // reach via call contexts). Is this Clone of callsite Node |
3141 | // assigned to a different clone of OrigFunc? If so, clone Node |
3142 | // again. |
3143 | (FuncCloneAssignedToCurCallsiteClone && |
3144 | FuncCloneAssignedToCurCallsiteClone != |
3145 | FuncCloneCalledByCaller)) { |
3146 | // We need to use a different newly created callsite clone, in |
3147 | // order to assign it to another new function clone on a |
3148 | // subsequent iteration over the Clones array (adjusted below). |
3149 | // Note we specifically do not reset the |
3150 | // CallsiteToCalleeFuncCloneMap entry for this caller, so that |
3151 | // when this new clone is processed later we know which version of |
3152 | // the function to copy (so that other callsite clones we have |
3153 | // assigned to that function clone are properly cloned over). See |
3154 | // comments in the function cloning handling earlier. |
3155 | |
3156 | // Check if we already have cloned this callsite again while |
3157 | // walking through caller edges, for a caller calling the same |
3158 | // function clone. If so, we can move this edge to that new clone |
3159 | // rather than creating yet another new clone. |
3160 | if (FuncCloneToNewCallsiteCloneMap.count( |
3161 | FuncCloneCalledByCaller)) { |
3162 | ContextNode *NewClone = |
3163 | FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; |
              moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3165 | // Cleanup any none type edges cloned over. |
              removeNoneTypeCalleeEdges(NewClone);
3167 | } else { |
3168 | // Create a new callsite clone. |
              ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
              removeNoneTypeCalleeEdges(NewClone);
3171 | FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = |
3172 | NewClone; |
3173 | // Add to list of clones and process later. |
3174 | ClonesWorklist.push_back(NewClone); |
3175 | assert(EI == Clone->CallerEdges.end() || |
3176 | Clone->AllocTypes != (uint8_t)AllocationType::None); |
3177 | assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
3178 | } |
3179 | // Moving the caller edge may have resulted in some none type |
3180 | // callee edges. |
            removeNoneTypeCalleeEdges(Clone);
3182 | // We will handle the newly created callsite clone in a subsequent |
3183 | // iteration over this Node's Clones. Continue here since we |
3184 | // already adjusted iterator EI while moving the edge. |
3185 | continue; |
3186 | } |
3187 | |
3188 | // Otherwise, we can use the function clone already assigned to this |
3189 | // caller. |
3190 | if (!FuncCloneAssignedToCurCallsiteClone) { |
3191 | FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; |
3192 | // Assign Clone to FuncCloneCalledByCaller |
3193 | AssignCallsiteCloneToFuncClone( |
3194 | FuncCloneCalledByCaller, Call, Clone, |
3195 | AllocationCallToContextNodeMap.count(Call)); |
3196 | } else |
3197 | // Don't need to do anything - callsite is already calling this |
3198 | // function clone. |
3199 | assert(FuncCloneAssignedToCurCallsiteClone == |
3200 | FuncCloneCalledByCaller); |
3201 | |
3202 | } else { |
3203 | // We have not already assigned this caller to a version of |
3204 | // OrigFunc. Do the assignment now. |
3205 | |
3206 | // First check if we have already assigned this callsite clone to a |
3207 | // clone of OrigFunc for another caller during this iteration over |
3208 | // its caller edges. |
3209 | if (!FuncCloneAssignedToCurCallsiteClone) { |
3210 | // Find first function in FuncClonesToCallMap without an assigned |
3211 | // clone of this callsite Node. We should always have one |
3212 | // available at this point due to the earlier cloning when the |
3213 | // FuncClonesToCallMap size was smaller than the clone number. |
3214 | for (auto &CF : FuncClonesToCallMap) { |
3215 | if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { |
3216 | FuncCloneAssignedToCurCallsiteClone = CF.first; |
3217 | break; |
3218 | } |
3219 | } |
3220 | assert(FuncCloneAssignedToCurCallsiteClone); |
3221 | // Assign Clone to FuncCloneAssignedToCurCallsiteClone |
3222 | AssignCallsiteCloneToFuncClone( |
3223 | FuncCloneAssignedToCurCallsiteClone, Call, Clone, |
3224 | AllocationCallToContextNodeMap.count(Call)); |
3225 | } else |
3226 | assert(FuncCloneToCurNodeCloneMap |
3227 | [FuncCloneAssignedToCurCallsiteClone] == Clone); |
3228 | // Update callers to record function version called. |
3229 | RecordCalleeFuncOfCallsite(Edge->Caller, |
3230 | FuncCloneAssignedToCurCallsiteClone); |
3231 | } |
3232 | |
3233 | EI++; |
3234 | } |
3235 | } |
3236 | if (VerifyCCG) { |
3237 | checkNode<DerivedCCG, FuncTy, CallTy>(Node); |
3238 | for (const auto &PE : Node->CalleeEdges) |
3239 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
3240 | for (const auto &CE : Node->CallerEdges) |
3241 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
3242 | for (auto *Clone : Node->Clones) { |
3243 | checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
3244 | for (const auto &PE : Clone->CalleeEdges) |
3245 | checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
3246 | for (const auto &CE : Clone->CallerEdges) |
3247 | checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
3248 | } |
3249 | } |
3250 | } |
3251 | } |
3252 | |
3253 | auto UpdateCalls = [&](ContextNode *Node, |
3254 | DenseSet<const ContextNode *> &Visited, |
3255 | auto &&UpdateCalls) { |
3256 | auto Inserted = Visited.insert(Node); |
3257 | if (!Inserted.second) |
3258 | return; |
3259 | |
3260 | for (auto *Clone : Node->Clones) |
3261 | UpdateCalls(Clone, Visited, UpdateCalls); |
3262 | |
3263 | for (auto &Edge : Node->CallerEdges) |
3264 | UpdateCalls(Edge->Caller, Visited, UpdateCalls); |
3265 | |
3266 | // Skip if either no call to update, or if we ended up with no context ids |
3267 | // (we moved all edges onto other clones). |
3268 | if (!Node->hasCall() || Node->ContextIds.empty()) |
3269 | return; |
3270 | |
3271 | if (Node->IsAllocation) { |
      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3273 | return; |
3274 | } |
3275 | |
3276 | if (!CallsiteToCalleeFuncCloneMap.count(Node)) |
3277 | return; |
3278 | |
3279 | auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; |
    updateCall(Node->Call, CalleeFunc);
3281 | }; |
3282 | |
3283 | // Performs DFS traversal starting from allocation nodes to update calls to |
3284 | // reflect cloning decisions recorded earlier. For regular LTO this will |
3285 | // update the actual calls in the IR to call the appropriate function clone |
3286 | // (and add attributes to allocation calls), whereas for ThinLTO the decisions |
3287 | // are recorded in the summary entries. |
3288 | DenseSet<const ContextNode *> Visited; |
3289 | for (auto &Entry : AllocationCallToContextNodeMap) |
3290 | UpdateCalls(Entry.second, Visited, UpdateCalls); |
3291 | |
3292 | return Changed; |
3293 | } |
3294 | |
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3296 | Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, |
3297 | std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
3298 | &FuncToAliasMap) { |
3299 | // The first "clone" is the original copy, we should only call this if we |
3300 | // needed to create new clones. |
3301 | assert(NumClones > 1); |
3302 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
  VMaps.reserve(NumClones - 1);
3304 | FunctionsClonedThinBackend++; |
3305 | for (unsigned I = 1; I < NumClones; I++) { |
    VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
    auto *NewF = CloneFunction(&F, *VMaps.back());
3308 | FunctionClonesThinBackend++; |
3309 | // Strip memprof and callsite metadata from clone as they are no longer |
3310 | // needed. |
3311 | for (auto &BB : *NewF) { |
3312 | for (auto &Inst : BB) { |
        Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
        Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3315 | } |
3316 | } |
    std::string Name = getMemProfFuncName(F.getName(), I);
3318 | auto *PrevF = M.getFunction(Name); |
3319 | if (PrevF) { |
3320 | // We might have created this when adjusting callsite in another |
3321 | // function. It should be a declaration. |
3322 | assert(PrevF->isDeclaration()); |
      NewF->takeName(PrevF);
      PrevF->replaceAllUsesWith(NewF);
3325 | PrevF->eraseFromParent(); |
3326 | } else |
3327 | NewF->setName(Name); |
3328 | ORE.emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofClone" , &F) |
3329 | << "created clone " << ore::NV("NewFunction" , NewF)); |
3330 | |
3331 | // Now handle aliases to this function, and clone those as well. |
    if (!FuncToAliasMap.count(&F))
3333 | continue; |
3334 | for (auto *A : FuncToAliasMap[&F]) { |
      std::string Name = getMemProfFuncName(A->getName(), I);
      auto *PrevA = M.getNamedAlias(Name);
      auto *NewA = GlobalAlias::create(A->getValueType(),
                                       A->getType()->getPointerAddressSpace(),
                                       A->getLinkage(), Name, NewF);
      NewA->copyAttributesFrom(A);
3341 | if (PrevA) { |
3342 | // We might have created this when adjusting callsite in another |
3343 | // function. It should be a declaration. |
3344 | assert(PrevA->isDeclaration()); |
        NewA->takeName(PrevA);
        PrevA->replaceAllUsesWith(NewA);
3347 | PrevA->eraseFromParent(); |
3348 | } |
3349 | } |
3350 | } |
3351 | return VMaps; |
3352 | } |
3353 | |
3354 | // Locate the summary for F. This is complicated by the fact that it might |
3355 | // have been internalized or promoted. |
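// For example (illustrative): a function "foo" that was internal in its
// original module may be found directly via F.getGUID(), or via the GUID of
// its unqualified name if it was internalized, or via the GUID of the
// source-file-qualified identifier built below if it was promoted; each
// lookup is tried in turn until one succeeds.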
3356 | static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, |
3357 | const ModuleSummaryIndex *ImportSummary) { |
3358 | // FIXME: Ideally we would retain the original GUID in some fashion on the |
3359 | // function (e.g. as metadata), but for now do our best to locate the |
3360 | // summary without that information. |
  ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3362 | if (!TheFnVI) |
3363 | // See if theFn was internalized, by checking index directly with |
3364 | // original name (this avoids the name adjustment done by getGUID() for |
3365 | // internal symbols). |
    TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3367 | if (TheFnVI) |
3368 | return TheFnVI; |
3369 | // Now query with the original name before any promotion was performed. |
3370 | StringRef OrigName = |
      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
  std::string OrigId = GlobalValue::getGlobalIdentifier(
      OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
  TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3375 | if (TheFnVI) |
3376 | return TheFnVI; |
3377 | // Could be a promoted local imported from another module. We need to pass |
3378 | // down more info here to find the original module id. For now, try with |
3379 | // the OrigName which might have been stored in the OidGuidMap in the |
3380 | // index. This would not work if there were same-named locals in multiple |
3381 | // modules, however. |
3382 | auto OrigGUID = |
      ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3384 | if (OrigGUID) |
    TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3386 | return TheFnVI; |
3387 | } |
3388 | |
3389 | bool MemProfContextDisambiguation::applyImport(Module &M) { |
3390 | assert(ImportSummary); |
3391 | bool Changed = false; |
3392 | |
3393 | auto IsMemProfClone = [](const Function &F) { |
    return F.getName().contains(MemProfCloneSuffix);
3395 | }; |
3396 | |
3397 | // We also need to clone any aliases that reference cloned functions, because |
3398 | // the modified callsites may invoke via the alias. Keep track of the aliases |
3399 | // for each function. |
3400 | std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
3401 | FuncToAliasMap; |
3402 | for (auto &A : M.aliases()) { |
3403 | auto *Aliasee = A.getAliaseeObject(); |
    if (auto *F = dyn_cast<Function>(Aliasee))
      FuncToAliasMap[F].insert(&A);
3406 | } |
3407 | |
3408 | for (auto &F : M) { |
3409 | if (F.isDeclaration() || IsMemProfClone(F)) |
3410 | continue; |
3411 | |
3412 | OptimizationRemarkEmitter ORE(&F); |
3413 | |
3414 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
3415 | bool ClonesCreated = false; |
3416 | unsigned NumClonesCreated = 0; |
3417 | auto CloneFuncIfNeeded = [&](unsigned NumClones) { |
3418 | // We should at least have version 0 which is the original copy. |
3419 | assert(NumClones > 0); |
3420 | // If only one copy needed use original. |
3421 | if (NumClones == 1) |
3422 | return; |
3423 | // If we already performed cloning of this function, confirm that the |
3424 | // requested number of clones matches (the thin link should ensure the |
3425 | // number of clones for each constituent callsite is consistent within |
3426 | // each function), before returning. |
3427 | if (ClonesCreated) { |
3428 | assert(NumClonesCreated == NumClones); |
3429 | return; |
3430 | } |
3431 | VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); |
3432 | // The first "clone" is the original copy, which doesn't have a VMap. |
3433 | assert(VMaps.size() == NumClones - 1); |
3434 | Changed = true; |
3435 | ClonesCreated = true; |
3436 | NumClonesCreated = NumClones; |
3437 | }; |
3438 | |
3439 | auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, |
3440 | Function *CalledFunction) { |
3441 | // Perform cloning if not yet done. |
3442 | CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); |
3443 | |
3444 | // Should have skipped indirect calls via mayHaveMemprofSummary. |
3445 | assert(CalledFunction); |
3446 | assert(!IsMemProfClone(*CalledFunction)); |
3447 | |
3448 | // Update the calls per the summary info. |
3449 | // Save orig name since it gets updated in the first iteration |
3450 | // below. |
3451 | auto CalleeOrigName = CalledFunction->getName(); |
3452 | for (unsigned J = 0; J < StackNode.Clones.size(); J++) { |
3453 | // Do nothing if this version calls the original version of its |
3454 | // callee. |
3455 | if (!StackNode.Clones[J]) |
3456 | continue; |
        auto NewF = M.getOrInsertFunction(
            getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
            CalledFunction->getFunctionType());
3460 | CallBase *CBClone; |
3461 | // Copy 0 is the original function. |
3462 | if (!J) |
3463 | CBClone = CB; |
3464 | else |
          CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3466 | CBClone->setCalledFunction(NewF); |
3467 | ORE.emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofCall" , CBClone) |
3468 | << ore::NV("Call" , CBClone) << " in clone " |
3469 | << ore::NV("Caller" , CBClone->getFunction()) |
3470 | << " assigned to call function clone " |
3471 | << ore::NV("Callee" , NewF.getCallee())); |
3472 | } |
3473 | }; |
3474 | |
3475 | // Locate the summary for F. |
3476 | ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); |
3477 | // If not found, this could be an imported local (see comment in |
3478 | // findValueInfoForFunc). Skip for now as it will be cloned in its original |
3479 | // module (where it would have been promoted to global scope so should |
3480 | // satisfy any reference in this module). |
3481 | if (!TheFnVI) |
3482 | continue; |
3483 | |
3484 | auto *GVSummary = |
        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3486 | if (!GVSummary) { |
      // Must have been imported; use the summary which matches the definition
      // (there might be multiple if this was a linkonce_odr).
      auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3490 | assert(SrcModuleMD && |
3491 | "enable-import-metadata is needed to emit thinlto_src_module" ); |
3492 | StringRef SrcModule = |
          dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3494 | for (auto &GVS : TheFnVI.getSummaryList()) { |
3495 | if (GVS->modulePath() == SrcModule) { |
3496 | GVSummary = GVS.get(); |
3497 | break; |
3498 | } |
3499 | } |
3500 | assert(GVSummary && GVSummary->modulePath() == SrcModule); |
3501 | } |
3502 | |
3503 | // If this was an imported alias skip it as we won't have the function |
3504 | // summary, and it should be cloned in the original module. |
    if (isa<AliasSummary>(GVSummary))
3506 | continue; |
3507 | |
    auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3509 | |
3510 | if (FS->allocs().empty() && FS->callsites().empty()) |
3511 | continue; |
3512 | |
3513 | auto SI = FS->callsites().begin(); |
3514 | auto AI = FS->allocs().begin(); |
3515 | |
3516 | // To handle callsite infos synthesized for tail calls which have missing |
3517 | // frames in the profiled context, map callee VI to the synthesized callsite |
3518 | // info. |
3519 | DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; |
3520 | // Iterate the callsites for this function in reverse, since we place all |
3521 | // those synthesized for tail calls at the end. |
3522 | for (auto CallsiteIt = FS->callsites().rbegin(); |
3523 | CallsiteIt != FS->callsites().rend(); CallsiteIt++) { |
3524 | auto &Callsite = *CallsiteIt; |
3525 | // Stop as soon as we see a non-synthesized callsite info (see comment |
3526 | // above loop). All the entries added for discovered tail calls have empty |
3527 | // stack ids. |
3528 | if (!Callsite.StackIdIndices.empty()) |
3529 | break; |
      MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3531 | } |
3532 | |
3533 | // Assume for now that the instructions are in the exact same order |
3534 | // as when the summary was created, but confirm this is correct by |
3535 | // matching the stack ids. |
3536 | for (auto &BB : F) { |
3537 | for (auto &I : BB) { |
        auto *CB = dyn_cast<CallBase>(&I);
3539 | // Same handling as when creating module summary. |
3540 | if (!mayHaveMemprofSummary(CB)) |
3541 | continue; |
3542 | |
3543 | auto *CalledValue = CB->getCalledOperand(); |
3544 | auto *CalledFunction = CB->getCalledFunction(); |
3545 | if (CalledValue && !CalledFunction) { |
3546 | CalledValue = CalledValue->stripPointerCasts(); |
3547 | // Stripping pointer casts can reveal a called function. |
          CalledFunction = dyn_cast<Function>(CalledValue);
3549 | } |
3550 | // Check if this is an alias to a function. If so, get the |
3551 | // called aliasee for the checks below. |
        if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3553 | assert(!CalledFunction && |
3554 | "Expected null called function in callsite for alias" ); |
          CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3556 | } |
3557 | |
3558 | CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
            I.getMetadata(LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3561 | |
3562 | // Include allocs that were already assigned a memprof function |
3563 | // attribute in the statistics. |
        if (CB->getAttributes().hasFnAttr("memprof")) {
3565 | assert(!MemProfMD); |
          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3567 | ? AllocTypeColdThinBackend++ |
3568 | : AllocTypeNotColdThinBackend++; |
3569 | OrigAllocsThinBackend++; |
3570 | AllocVersionsThinBackend++; |
3571 | if (!MaxAllocVersionsThinBackend) |
3572 | MaxAllocVersionsThinBackend = 1; |
          // Remove any remaining callsite metadata; we can skip the rest of
          // the handling for this instruction, since no cloning is needed.
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
3576 | continue; |
3577 | } |
3578 | |
3579 | if (MemProfMD) { |
3580 | // Consult the next alloc node. |
3581 | assert(AI != FS->allocs().end()); |
3582 | auto &AllocNode = *(AI++); |
3583 | |
3584 | // Sanity check that the MIB stack ids match between the summary and |
3585 | // instruction metadata. |
3586 | auto MIBIter = AllocNode.MIBs.begin(); |
3587 | for (auto &MDOp : MemProfMD->operands()) { |
3588 | assert(MIBIter != AllocNode.MIBs.end()); |
3589 | LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter = |
3590 | MIBIter->StackIdIndices.begin(); |
            auto *MIBMD = cast<const MDNode>(MDOp);
            MDNode *StackMDNode = getMIBStackNode(MIBMD);
3593 | assert(StackMDNode); |
3594 | CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); |
3595 | auto ContextIterBegin = |
                StackContext.beginAfterSharedPrefix(CallsiteContext);
3597 | // Skip the checking on the first iteration. |
3598 | uint64_t LastStackContextId = |
3599 | (ContextIterBegin != StackContext.end() && |
3600 | *ContextIterBegin == 0) |
3601 | ? 1 |
3602 | : 0; |
3603 | for (auto ContextIter = ContextIterBegin; |
3604 | ContextIter != StackContext.end(); ++ContextIter) { |
3605 | // If this is a direct recursion, simply skip the duplicate |
3606 | // entries, to be consistent with how the summary ids were |
3607 | // generated during ModuleSummaryAnalysis. |
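              // For example (illustrative): a metadata context with stack
              // ids [A, A, B] from direct recursion matches summary indices
              // for [A, B]; the duplicate A is skipped by the check below.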
3608 | if (LastStackContextId == *ContextIter) |
3609 | continue; |
3610 | LastStackContextId = *ContextIter; |
3611 | assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); |
3612 | assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == |
3613 | *ContextIter); |
3614 | StackIdIndexIter++; |
3615 | } |
3616 | MIBIter++; |
3617 | } |
3618 | |
3619 | // Perform cloning if not yet done. |
3620 | CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); |
3621 | |
3622 | OrigAllocsThinBackend++; |
3623 | AllocVersionsThinBackend += AllocNode.Versions.size(); |
3624 | if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) |
3625 | MaxAllocVersionsThinBackend = AllocNode.Versions.size(); |
3626 | |
3627 | // If there is only one version that means we didn't end up |
3628 | // considering this function for cloning, and in that case the alloc |
3629 | // will still be none type or should have gotten the default NotCold. |
3630 | // Skip that after calling clone helper since that does some sanity |
3631 | // checks that confirm we haven't decided yet that we need cloning. |
3632 | if (AllocNode.Versions.size() == 1) { |
3633 | assert((AllocationType)AllocNode.Versions[0] == |
3634 | AllocationType::NotCold || |
3635 | (AllocationType)AllocNode.Versions[0] == |
3636 | AllocationType::None); |
3637 | UnclonableAllocsThinBackend++; |
3638 | continue; |
3639 | } |
3640 | |
3641 | // All versions should have a singular allocation type. |
3642 | assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { |
3643 | return Type == ((uint8_t)AllocationType::NotCold | |
3644 | (uint8_t)AllocationType::Cold); |
3645 | })); |
3646 | |
3647 | // Update the allocation types per the summary info. |
3648 | for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { |
3649 | // Ignore any that didn't get an assigned allocation type. |
3650 | if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) |
3651 | continue; |
3652 | AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; |
3653 | AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ |
3654 | : AllocTypeNotColdThinBackend++; |
            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
            auto A = llvm::Attribute::get(F.getContext(), "memprof",
                                          AllocTypeString);
3658 | CallBase *CBClone; |
3659 | // Copy 0 is the original function. |
3660 | if (!J) |
3661 | CBClone = CB; |
3662 | else |
3663 | // Since VMaps are only created for new clones, we index with |
3664 | // clone J-1 (J==0 is the original clone and does not have a VMaps |
3665 | // entry). |
              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(A);
3668 | ORE.emit(OptDiag&: OptimizationRemark(DEBUG_TYPE, "MemprofAttribute" , CBClone) |
3669 | << ore::NV("AllocationCall" , CBClone) << " in clone " |
3670 | << ore::NV("Caller" , CBClone->getFunction()) |
3671 | << " marked with memprof allocation attribute " |
3672 | << ore::NV("Attribute" , AllocTypeString)); |
3673 | } |
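          // Illustrative result (a sketch; symbol and attribute-group names
          // hypothetical): after this loop a cold clone's allocation call
          // looks like
          //   %call = call ptr @_Znwm(i64 8) #2
          //   attributes #2 = { ... "memprof"="cold" }
          // which a later transformation can rewrite to a hinted allocation.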
        } else if (!CallsiteContext.empty()) {
          // Consult the next callsite node.
          assert(SI != FS->callsites().end());
          auto &StackNode = *(SI++);

#ifndef NDEBUG
          // Sanity check that the stack ids match between the summary and
          // instruction metadata.
          auto StackIdIndexIter = StackNode.StackIdIndices.begin();
          for (auto StackId : CallsiteContext) {
            assert(StackIdIndexIter != StackNode.StackIdIndices.end());
            assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                   StackId);
            StackIdIndexIter++;
          }
#endif
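          // For reference (hypothetical ids): the instruction's !callsite
          // metadata is simply a list of i64 stack ids, e.g.
          //   !3 = !{i64 123, i64 456}
          // which must line up, in order, with the summary callsite record's
          // StackIdIndices as verified above.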

          CloneCallsite(StackNode, CB, CalledFunction);
        } else if (CB->isTailCall()) {
          // Locate the synthesized callsite info for the callee VI, if any
          // was created, and use that for cloning.
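          // Note: if no synthesized record is found below, no cloning is
          // performed for this call (presumably the thin link could not
          // uniquely identify the profiled callee chain through the tail
          // call; this rationale is an inference from the lookup below).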
          ValueInfo CalleeVI =
              findValueInfoForFunc(*CalledFunction, M, ImportSummary);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction);
          }
        }
        // Memprof and callsite metadata on memory allocations no longer
        // needed.
        I.setMetadata(LLVMContext::MD_memprof, nullptr);
        I.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
  }

  return Changed;
}

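// A brief orientation note for the driver below: the dump, verify, and dot
// export hooks run before and after each phase, gated by cl::opts defined
// earlier in this file (the corresponding flags are presumably
// -memprof-dump-ccg, -memprof-verify-ccg, and -memprof-export-to-dot,
// matching the DumpCCG, VerifyCCG, and ExportToDot option variables).
// identifyClones() marks which context graph nodes need clones, and
// assignFunctions() materializes function clones and reports whether the
// module or index changed.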
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
  if (DumpCCG) {
    dbgs() << "CCG before cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("postbuild");

  if (VerifyCCG)
    check();

  identifyClones();

  if (VerifyCCG)
    check();

  if (DumpCCG) {
    dbgs() << "CCG after cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("cloned");

  bool Changed = assignFunctions();

  if (DumpCCG) {
    dbgs() << "CCG after assigning function clones:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("clonefuncassign");

  return Changed;
}

bool MemProfContextDisambiguation::processModule(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {

  // If we have an import summary, then the cloning decisions were made during
  // the thin link on the index. Apply them and return.
  if (ImportSummary)
    return applyImport(M);

  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // Note that we specifically check this after applying imports above, so
  // that the option doesn't need to be passed to distributed ThinLTO backend
  // clang processes, which won't necessarily have visibility into the linker
  // dependences. Instead the information is communicated from the LTO link to
  // the backends via the combined summary index.
  if (!SupportsHotColdNew)
    return false;

  ModuleCallsiteContextGraph CCG(M, OREGetter);
  return CCG.process();
}

MemProfContextDisambiguation::MemProfContextDisambiguation(
    const ModuleSummaryIndex *Summary)
    : ImportSummary(Summary) {
  if (ImportSummary) {
    // The MemProfImportSummary should only be used for testing ThinLTO
    // distributed backend handling via opt, in which case we don't have a
    // summary from the pass pipeline.
    assert(MemProfImportSummary.empty());
    return;
  }
  if (MemProfImportSummary.empty())
    return;

  auto ReadSummaryFile =
      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
  if (!ReadSummaryFile) {
    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
                          "Error loading file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
  if (!ImportSummaryForTestingOrErr) {
    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
                          "Error parsing file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
  ImportSummary = ImportSummaryForTesting.get();
}
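// Example invocation (a sketch for testing only, assuming the
// MemProfImportSummary cl::opt is registered as -memprof-import-summary and
// this pass is registered as memprof-context-disambiguation):
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=module.thinlto.bc module.bc -S
// This exercises the applyImport path above without running a full
// distributed ThinLTO link.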

PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };
  if (!processModule(M, OREGetter))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

void MemProfContextDisambiguation::run(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing) {
  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // The index was set from the option, so these should be in sync.
  assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
  if (!SupportsHotColdNew)
    return;

  IndexCallsiteContextGraph CCG(Index, isPrevailing);
  CCG.process();
}