1 | //===- RootAutodetector.cpp - detect contextual profiling roots -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "RootAutoDetector.h" |
10 | |
11 | #include "CtxInstrProfiling.h" |
12 | #include "sanitizer_common/sanitizer_common.h" |
13 | #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap) |
14 | #include <assert.h> |
15 | #include <dlfcn.h> |
16 | #include <pthread.h> |
17 | |
using namespace __ctx_profile;
// DenseMap used purely as a set: only the keys matter, the bool value is
// ignored (there is no DenseSet available in this runtime).
template <typename T> using Set = DenseMap<T, bool>;
20 | |
namespace __sanitizer {
// Hook required by sanitizer_common: BufferedStackTrace::Unwind dispatches
// here. We always take the slow unwind path, regardless of `request_fast`.
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
                                    bool request_fast, u32 max_depth) {
  // We can't implement the fast variant. The fast variant ends up invoking an
  // external allocator, because of pthread_attr_getstack. If this happens
  // during an allocation of the program being instrumented, a non-reentrant
  // lock may be taken (this was observed). The allocator called by
  // pthread_attr_getstack will also try to take that lock.
  UnwindSlow(pc, max_depth);
}
} // namespace __sanitizer
32 | |
33 | RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) { |
34 | GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex); |
35 | Parent.AllSamples.PushBack(v: this); |
36 | } |
37 | |
38 | void RootAutoDetector::start() { |
39 | atomic_store_relaxed(a: &Self, v: reinterpret_cast<uintptr_t>(this)); |
40 | pthread_create( |
41 | newthread: &WorkerThread, attr: nullptr, |
42 | start_routine: +[](void *Ctx) -> void * { |
43 | RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx); |
44 | SleepForSeconds(seconds: RAD->WaitSeconds); |
45 | // To avoid holding the AllSamplesMutex, make a snapshot of all the |
46 | // thread samples collected so far |
47 | Vector<PerThreadSamples *> SamplesSnapshot; |
48 | { |
49 | GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex); |
50 | SamplesSnapshot.Resize(size: RAD->AllSamples.Size()); |
51 | for (uptr I = 0; I < RAD->AllSamples.Size(); ++I) |
52 | SamplesSnapshot[I] = RAD->AllSamples[I]; |
53 | } |
54 | DenseMap<uptr, uint64_t> AllRoots; |
55 | for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) { |
56 | GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M); |
57 | SamplesSnapshot[I]->TrieRoot.determineRoots().forEach(fn: [&](auto &KVP) { |
58 | auto [FAddr, Count] = KVP; |
59 | AllRoots[FAddr] += Count; |
60 | return true; |
61 | }); |
62 | } |
63 | // FIXME: as a next step, establish a minimum relative nr of samples |
64 | // per root that would qualify it as a root. |
65 | for (auto *FD = reinterpret_cast<FunctionData *>( |
66 | atomic_load_relaxed(a: &RAD->FunctionDataListHead)); |
67 | FD; FD = FD->Next) { |
68 | if (AllRoots.contains(Key: reinterpret_cast<uptr>(FD->EntryAddress))) { |
69 | if (canBeRoot(Ctx: FD->CtxRoot)) { |
70 | FD->getOrAllocateContextRoot(); |
71 | } else { |
72 | // FIXME: address this by informing the root detection algorithm |
73 | // to skip over such functions and pick the next down in the |
74 | // stack. At that point, this becomes an assert. |
75 | Printf(format: "[ctxprof] Root auto-detector selected a musttail " |
76 | "function for root (%p). Ignoring\n" , |
77 | FD->EntryAddress); |
78 | } |
79 | } |
80 | } |
81 | atomic_store_relaxed(a: &RAD->Self, v: 0); |
82 | return nullptr; |
83 | }, |
84 | arg: this); |
85 | } |
86 | |
87 | void RootAutoDetector::join() { pthread_join(th: WorkerThread, thread_return: nullptr); } |
88 | |
89 | void RootAutoDetector::sample() { |
90 | // tracking reentry in case we want to re-explore fast stack unwind - which |
91 | // does potentially re-enter the runtime because it calls the instrumented |
92 | // allocator because of pthread_attr_getstack. See the notes also on |
93 | // UnwindImpl above. |
94 | static thread_local bool Entered = false; |
95 | static thread_local uint64_t Entries = 0; |
96 | if (Entered || (++Entries % SampleRate)) |
97 | return; |
98 | Entered = true; |
99 | collectStack(); |
100 | Entered = false; |
101 | } |
102 | |
103 | void RootAutoDetector::collectStack() { |
104 | GET_CALLER_PC_BP; |
105 | BufferedStackTrace CurrentStack; |
106 | CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false); |
107 | // 2 stack frames would be very unlikely to mean anything, since at least the |
108 | // compiler-rt frame - which can't be inlined - should be observable, which |
109 | // counts as 1; we can be even more aggressive with this number. |
110 | if (CurrentStack.size <= 2) |
111 | return; |
112 | static thread_local PerThreadSamples *ThisThreadSamples = |
113 | new (__sanitizer::InternalAlloc(size: sizeof(PerThreadSamples))) |
114 | PerThreadSamples(*this); |
115 | |
116 | if (!ThisThreadSamples->M.TryLock()) |
117 | return; |
118 | |
119 | ThisThreadSamples->TrieRoot.insertStack(ST: CurrentStack); |
120 | ThisThreadSamples->M.Unlock(); |
121 | } |
122 | |
123 | uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const { |
124 | // this requires --linkopt=-Wl,--export-dynamic |
125 | Dl_info Info; |
126 | if (dladdr(address: reinterpret_cast<const void *>(CallsiteAddress), info: &Info) != 0) |
127 | return reinterpret_cast<uptr>(Info.dli_saddr); |
128 | return 0; |
129 | } |
130 | |
131 | void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) { |
132 | ++TheTrie.Count; |
133 | auto *Current = &TheTrie; |
134 | // the stack is backwards - the first callsite is at the top. |
135 | for (int I = ST.size - 1; I >= 0; --I) { |
136 | uptr ChildAddr = ST.trace[I]; |
137 | auto [Iter, _] = Current->Children.insert(KV: {ChildAddr, Trie(ChildAddr)}); |
138 | ++Iter->second.Count; |
139 | Current = &Iter->second; |
140 | } |
141 | } |
142 | |
143 | DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const { |
144 | // Assuming a message pump design, roots are those functions called by the |
145 | // message pump. The message pump is an infinite loop (for all practical |
146 | // considerations) fetching data from a queue. The root functions return - |
147 | // otherwise the message pump doesn't work. This function detects roots as the |
148 | // first place in the trie (starting from the root) where a function calls 2 |
149 | // or more functions. |
150 | // |
151 | // We start with a callsite trie - the nodes are callsites. Different child |
152 | // nodes may actually correspond to the same function. |
153 | // |
154 | // For example: using function(callsite) |
155 | // f1(csf1_1) -> f2(csf2_1) -> f3 |
156 | // -> f2(csf2_2) -> f4 |
157 | // |
158 | // would be represented in our trie as: |
159 | // csf1_1 -> csf2_1 -> f3 |
160 | // -> csf2_2 -> f4 |
161 | // |
162 | // While we can assert the control flow returns to f2, we don't know if it |
163 | // ever returns to f1. f2 could be the message pump. |
164 | // |
165 | // We need to convert our callsite tree into a function tree. We can also, |
166 | // more economically, just see how many distinct functions there are at a |
167 | // certain depth. When that count is greater than 1, we got to potential roots |
168 | // and everything above should be considered as non-roots. |
169 | DenseMap<uptr, uint64_t> Result; |
170 | Set<const Trie *> Worklist; |
171 | Worklist.insert(KV: {&TheTrie, {}}); |
172 | |
173 | while (!Worklist.empty()) { |
174 | Set<const Trie *> NextWorklist; |
175 | DenseMap<uptr, uint64_t> Candidates; |
176 | Worklist.forEach(fn: [&](const auto &KVP) { |
177 | auto [Node, _] = KVP; |
178 | auto SA = getFctStartAddr(CallsiteAddress: Node->CallsiteAddress); |
179 | Candidates[SA] += Node->Count; |
180 | Node->Children.forEach([&](auto &ChildKVP) { |
181 | NextWorklist.insert({&ChildKVP.second, true}); |
182 | return true; |
183 | }); |
184 | return true; |
185 | }); |
186 | if (Candidates.size() > 1) { |
187 | Result.swap(RHS&: Candidates); |
188 | break; |
189 | } |
190 | Worklist.swap(RHS&: NextWorklist); |
191 | } |
192 | return Result; |
193 | } |
194 | |