//===- RootAutoDetector.cpp - detect contextual profiling roots -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RootAutoDetector.h"

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
#include <assert.h>
#include <dlfcn.h>
#include <pthread.h>

using namespace __ctx_profile;
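// DenseMap used as a set: only the keys matter; the mapped bool is never read.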
template <typename T> using Set = DenseMap<T, bool>;

namespace __sanitizer {
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
                                    bool request_fast, u32 max_depth) {
  // We can't implement the fast variant. The fast variant ends up invoking an
  // external allocator, because of pthread_attr_getstack. If this happens
  // during an allocation of the program being instrumented, a non-reentrant
  // lock may be taken (this was observed). The allocator called by
  // pthread_attr_getstack will also try to take that lock.
  UnwindSlow(pc, max_depth);
}
} // namespace __sanitizer

RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
  GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
  Parent.AllSamples.PushBack(this);
}

void RootAutoDetector::start() {
  atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
  pthread_create(
      &WorkerThread, nullptr,
      +[](void *Ctx) -> void * {
        RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
        SleepForSeconds(RAD->WaitSeconds);
        // To avoid holding the AllSamplesMutex, make a snapshot of all the
        // thread samples collected so far.
        Vector<PerThreadSamples *> SamplesSnapshot;
        {
          GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
          SamplesSnapshot.Resize(RAD->AllSamples.Size());
          for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
            SamplesSnapshot[I] = RAD->AllSamples[I];
        }
        DenseMap<uptr, uint64_t> AllRoots;
        for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {
          GenericScopedLock<SpinMutex> L(&SamplesSnapshot[I]->M);
          SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
            auto [FAddr, Count] = KVP;
            AllRoots[FAddr] += Count;
            return true;
          });
        }
        // FIXME: as a next step, establish a minimum relative number of
        // samples per root that would qualify it as a root.
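        // Walk the registered FunctionData list and promote each function
        // whose entry address was detected as a root.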
        for (auto *FD = reinterpret_cast<FunctionData *>(
                 atomic_load_relaxed(&RAD->FunctionDataListHead));
             FD; FD = FD->Next) {
          if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
            if (canBeRoot(FD->CtxRoot)) {
              FD->getOrAllocateContextRoot();
            } else {
              // FIXME: address this by informing the root detection algorithm
              // to skip over such functions and pick the next one down in the
              // stack. At that point, this becomes an assert.
              Printf("[ctxprof] Root auto-detector selected a musttail "
                     "function for root (%p). Ignoring\n",
                     FD->EntryAddress);
            }
          }
        }
        atomic_store_relaxed(&RAD->Self, 0);
        return nullptr;
      },
      this);
}

void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }

void RootAutoDetector::sample() {
  // Track reentry in case we want to re-explore fast stack unwinding, which
  // may re-enter the runtime because pthread_attr_getstack ends up calling the
  // instrumented allocator. See also the notes on UnwindImpl above.
  static thread_local bool Entered = false;
  static thread_local uint64_t Entries = 0;
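  // Only unwind every SampleRate-th call to sample(), and never reentrantly.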
  if (Entered || (++Entries % SampleRate))
    return;
  Entered = true;
  collectStack();
  Entered = false;
}

void RootAutoDetector::collectStack() {
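  // GET_CALLER_PC_BP (a sanitizer_common macro) defines the `pc` and `bp`
  // locals used to seed the unwind below.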
  GET_CALLER_PC_BP;
  BufferedStackTrace CurrentStack;
  CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);
  // 2 stack frames would be very unlikely to mean anything, since at least the
  // compiler-rt frame - which can't be inlined - should be observable, which
  // counts as 1; we can be even more aggressive with this number.
  if (CurrentStack.size <= 2)
    return;
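  // Lazily create this thread's sample trie with the sanitizer-internal
  // allocator; the PerThreadSamples constructor registers it with the parent
  // detector.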
  static thread_local PerThreadSamples *ThisThreadSamples =
      new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
          PerThreadSamples(*this);

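  // If the root detection thread is currently reading this trie, drop the
  // sample rather than block.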
  if (!ThisThreadSamples->M.TryLock())
    return;

  ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
  ThisThreadSamples->M.Unlock();
}

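// Resolve a callsite address to the start address of its enclosing function.
// Returns 0 if the address cannot be resolved to a symbol.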
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
  // This requires --linkopt=-Wl,--export-dynamic.
  Dl_info Info;
  if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0)
    return reinterpret_cast<uptr>(Info.dli_saddr);
  return 0;
}

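// Record one sampled stack in the trie: walk from the outermost frame down to
// the sampling point, bumping the count of every callsite node along the path.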
void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
  ++TheTrie.Count;
  auto *Current = &TheTrie;
  // The stack is backwards - the first callsite is at the top.
  for (int I = ST.size - 1; I >= 0; --I) {
    uptr ChildAddr = ST.trace[I];
    auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
    ++Iter->second.Count;
    Current = &Iter->second;
  }
}

DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
  // Assuming a message pump design, roots are those functions called by the
  // message pump. The message pump is an infinite loop (for all practical
  // purposes) fetching data from a queue. The root functions return -
  // otherwise the message pump doesn't work. This function detects roots as
  // the first place in the trie (starting from the root) where a function
  // calls 2 or more functions.
  //
  // We start with a callsite trie - the nodes are callsites. Different child
  // nodes may actually correspond to the same function.
  //
  // For example, using function(callsite):
  // f1(csf1_1) -> f2(csf2_1) -> f3
  //            -> f2(csf2_2) -> f4
  //
  // would be represented in our trie as:
  // csf1_1 -> csf2_1 -> f3
  //        -> csf2_2 -> f4
  //
  // While we can assert the control flow returns to f2, we don't know if it
  // ever returns to f1. f2 could be the message pump.
  //
  // We need to convert our callsite tree into a function tree. We can also,
  // more economically, just count how many distinct functions appear at a
  // certain depth. When that count is greater than 1, we have reached
  // potential roots, and everything above them should be considered non-roots.
  DenseMap<uptr, uint64_t> Result;
  Set<const Trie *> Worklist;
  Worklist.insert({&TheTrie, {}});

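  // Breadth-first walk over the trie, one depth level per iteration.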
  while (!Worklist.empty()) {
    Set<const Trie *> NextWorklist;
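    // Sample counts at the current depth, keyed by the start address of the
    // enclosing function; distinct callsites in the same function collapse
    // into one entry.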
    DenseMap<uptr, uint64_t> Candidates;
    Worklist.forEach([&](const auto &KVP) {
      auto [Node, _] = KVP;
      auto SA = getFctStartAddr(Node->CallsiteAddress);
      Candidates[SA] += Node->Count;
      Node->Children.forEach([&](auto &ChildKVP) {
        NextWorklist.insert({&ChildKVP.second, true});
        return true;
      });
      return true;
    });
    if (Candidates.size() > 1) {
      Result.swap(Candidates);
      break;
    }
    Worklist.swap(NextWorklist);
  }
  return Result;
}
