1 | //===- RootAutodetector.cpp - detect contextual profiling roots -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "RootAutoDetector.h" |
10 | |
11 | #include "CtxInstrProfiling.h" |
12 | #include "sanitizer_common/sanitizer_common.h" |
13 | #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap) |
14 | #include <assert.h> |
15 | #include <dlfcn.h> |
16 | #include <pthread.h> |
17 | |
using namespace __ctx_profile;
// DenseMap used purely as a set: only the keys matter, the bool value is
// ignored (there is no DenseSet available in this runtime).
template <typename T> using Set = DenseMap<T, bool>;
20 | |
namespace __sanitizer {
// Hook required by sanitizer_common: BufferedStackTrace::Unwind dispatches
// here. We always take the slow unwind path, regardless of `request_fast`.
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
                                    bool request_fast, u32 max_depth) {
  // We can't implement the fast variant. The fast variant ends up invoking an
  // external allocator, because of pthread_attr_getstack. If this happens
  // during an allocation of the program being instrumented, a non-reentrant
  // lock may be taken (this was observed). The allocator called by
  // pthread_attr_getstack will also try to take that lock.
  UnwindSlow(pc, max_depth);
}
} // namespace __sanitizer
32 | |
33 | RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) { |
34 | GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex); |
35 | Parent.AllSamples.PushBack(v: this); |
36 | } |
37 | |
38 | void RootAutoDetector::start() { |
39 | atomic_store_relaxed(a: &Self, v: reinterpret_cast<uintptr_t>(this)); |
40 | pthread_create( |
41 | newthread: &WorkerThread, attr: nullptr, |
42 | start_routine: +[](void *Ctx) -> void * { |
43 | RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx); |
44 | SleepForSeconds(seconds: RAD->WaitSeconds); |
45 | // To avoid holding the AllSamplesMutex, make a snapshot of all the |
46 | // thread samples collected so far |
47 | Vector<PerThreadSamples *> SamplesSnapshot; |
48 | { |
49 | GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex); |
50 | SamplesSnapshot.Resize(size: RAD->AllSamples.Size()); |
51 | for (uptr I = 0; I < RAD->AllSamples.Size(); ++I) |
52 | SamplesSnapshot[I] = RAD->AllSamples[I]; |
53 | } |
54 | DenseMap<uptr, uint64_t> AllRoots; |
55 | for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) { |
56 | GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M); |
57 | SamplesSnapshot[I]->TrieRoot.determineRoots().forEach(fn: [&](auto &KVP) { |
58 | auto [FAddr, Count] = KVP; |
59 | AllRoots[FAddr] += Count; |
60 | return true; |
61 | }); |
62 | } |
63 | // FIXME: as a next step, establish a minimum relative nr of samples |
64 | // per root that would qualify it as a root. |
65 | for (auto *FD = reinterpret_cast<FunctionData *>( |
66 | atomic_load_relaxed(a: &RAD->FunctionDataListHead)); |
67 | FD; FD = FD->Next) { |
68 | if (AllRoots.contains(Key: reinterpret_cast<uptr>(FD->EntryAddress))) { |
69 | if (canBeRoot(Ctx: FD->CtxRoot)) { |
70 | FD->getOrAllocateContextRoot(); |
71 | } else { |
72 | // FIXME: address this by informing the root detection algorithm |
73 | // to skip over such functions and pick the next down in the |
74 | // stack. At that point, this becomes an assert. |
75 | Printf(format: "[ctxprof] Root auto-detector selected a musttail " |
76 | "function for root (%p). Ignoring\n" , |
77 | FD->EntryAddress); |
78 | } |
79 | } |
80 | } |
81 | atomic_store_relaxed(a: &RAD->Self, v: 0); |
82 | return nullptr; |
83 | }, |
84 | arg: this); |
85 | } |
86 | |
87 | void RootAutoDetector::join() { pthread_join(th: WorkerThread, thread_return: nullptr); } |
88 | |
89 | void RootAutoDetector::sample() { |
90 | // tracking reentry in case we want to re-explore fast stack unwind - which |
91 | // does potentially re-enter the runtime because it calls the instrumented |
92 | // allocator because of pthread_attr_getstack. See the notes also on |
93 | // UnwindImpl above. |
94 | static thread_local bool Entered = false; |
95 | static thread_local uint64_t Entries = 0; |
96 | if (Entered || (++Entries % SampleRate)) |
97 | return; |
98 | Entered = true; |
99 | collectStack(); |
100 | Entered = false; |
101 | } |
102 | |
103 | void RootAutoDetector::collectStack() { |
104 | GET_CALLER_PC_BP; |
105 | BufferedStackTrace CurrentStack; |
106 | CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false); |
107 | // 2 stack frames would be very unlikely to mean anything, since at least the |
108 | // compiler-rt frame - which can't be inlined - should be observable, which |
109 | // counts as 1; we can be even more aggressive with this number. |
110 | if (CurrentStack.size <= 2) |
111 | return; |
112 | static thread_local PerThreadSamples *ThisThreadSamples = |
113 | new (__sanitizer::InternalAlloc(size: sizeof(PerThreadSamples))) |
114 | PerThreadSamples(*this); |
115 | |
116 | if (!ThisThreadSamples->M.TryLock()) |
117 | return; |
118 | |
119 | ThisThreadSamples->TrieRoot.insertStack(ST: CurrentStack); |
120 | ThisThreadSamples->M.Unlock(); |
121 | } |
122 | |
123 | uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const { |
124 | // this requires --linkopt=-Wl,--export-dynamic |
125 | Dl_info Info; |
126 | if (dladdr(address: reinterpret_cast<const void *>(CallsiteAddress), info: &Info) != 0) |
127 | return reinterpret_cast<uptr>(Info.dli_saddr); |
128 | return 0; |
129 | } |
130 | |
131 | void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) { |
132 | ++TheTrie.Count; |
133 | auto *Current = &TheTrie; |
134 | // the stack is backwards - the first callsite is at the top. |
135 | for (int I = ST.size - 1; I >= 0; --I) { |
136 | uptr ChildAddr = ST.trace[I]; |
137 | auto [Iter, _] = Current->Children.insert(KV: {ChildAddr, Trie(ChildAddr)}); |
138 | ++Iter->second.Count; |
139 | Current = &Iter->second; |
140 | } |
141 | } |
142 | |
143 | DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const { |
144 | // Assuming a message pump design, roots are those functions called by the |
145 | // message pump. The message pump is an infinite loop (for all practical |
146 | // considerations) fetching data from a queue. The root functions return - |
147 | // otherwise the message pump doesn't work. This function detects roots as the |
148 | // first place in the trie (starting from the root) where a function calls 2 |
149 | // or more functions. |
150 | // |
151 | // We start with a callsite trie - the nodes are callsites. Different child |
152 | // nodes may actually correspond to the same function. |
153 | // |
154 | // For example: using function(callsite) |
155 | // f1(csf1_1) -> f2(csf2_1) -> f3 |
156 | // -> f2(csf2_2) -> f4 |
157 | // |
158 | // would be represented in our trie as: |
159 | // csf1_1 -> csf2_1 -> f3 |
160 | // -> csf2_2 -> f4 |
161 | // |
162 | // While we can assert the control flow returns to f2, we don't know if it |
163 | // ever returns to f1. f2 could be the message pump. |
164 | // |
165 | // We need to convert our callsite tree into a function tree. We can also, |
166 | // more economically, just see how many distinct functions there are at a |
167 | // certain depth. When that count is greater than 1, we got to potential roots |
168 | // and everything above should be considered as non-roots. |
169 | DenseMap<uptr, uint64_t> Result; |
170 | Set<const Trie *> Worklist; |
171 | Worklist.insert(KV: {&TheTrie, {}}); |
172 | |
173 | while (!Worklist.empty()) { |
174 | Set<const Trie *> NextWorklist; |
175 | DenseMap<uptr, uint64_t> Candidates; |
176 | Worklist.forEach(fn: [&](const auto &KVP) { |
177 | auto [Node, _] = KVP; |
178 | auto SA = getFctStartAddr(CallsiteAddress: Node->CallsiteAddress); |
179 | Candidates[SA] += Node->Count; |
180 | Node->Children.forEach([&](auto &ChildKVP) { |
181 | NextWorklist.insert({&ChildKVP.second, true}); |
182 | return true; |
183 | }); |
184 | return true; |
185 | }); |
186 | if (Candidates.size() > 1) { |
187 | Result.swap(RHS&: Candidates); |
188 | break; |
189 | } |
190 | Worklist.swap(RHS&: NextWorklist); |
191 | } |
192 | return Result; |
193 | } |
194 | |