1 | //===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file is a part of XRay, a dynamic runtime instrumentation system. |
10 | // |
11 | // This implements the interface for the profileCollectorService. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | #include "xray_profile_collector.h" |
15 | #include "sanitizer_common/sanitizer_common.h" |
16 | #include "xray_allocator.h" |
17 | #include "xray_defs.h" |
18 | #include "xray_profiling_flags.h" |
19 | #include "xray_segmented_array.h" |
20 | #include <memory> |
21 | #include <pthread.h> |
22 | #include <utility> |
23 | |
24 | namespace __xray { |
25 | namespace profileCollectorService { |
26 | |
27 | namespace { |
28 | |
29 | SpinMutex GlobalMutex; |
30 | struct ThreadTrie { |
31 | tid_t TId; |
32 | typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage; |
33 | }; |
34 | |
35 | struct ProfileBuffer { |
36 | void *Data; |
37 | size_t Size; |
38 | }; |
39 | |
40 | // Current version of the profile format. |
41 | constexpr u64 XRayProfilingVersion = 0x20180424; |
42 | |
43 | // Identifier for XRay profiling files 'xrayprof' in hex. |
44 | constexpr u64 XRayMagicBytes = 0x7872617970726f66; |
45 | |
struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};
52 | |
struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};
58 | |
59 | struct ThreadData { |
60 | BufferQueue *BQ; |
61 | FunctionCallTrie::Allocators::Buffers Buffers; |
62 | FunctionCallTrie::Allocators Allocators; |
63 | FunctionCallTrie FCT; |
64 | tid_t TId; |
65 | }; |
66 | |
67 | using ThreadDataArray = Array<ThreadData>; |
68 | using ThreadDataAllocator = ThreadDataArray::AllocatorType; |
69 | |
70 | // We use a separate buffer queue for the backing store for the allocator used |
71 | // by the ThreadData array. This lets us host the buffers, allocators, and tries |
72 | // associated with a thread by moving the data into the array instead of |
73 | // attempting to copy the data to a separately backed set of tries. |
74 | static typename std::aligned_storage< |
75 | sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage; |
76 | static BufferQueue *BQ = nullptr; |
77 | static BufferQueue::Buffer Buffer; |
78 | static typename std::aligned_storage<sizeof(ThreadDataAllocator), |
79 | alignof(ThreadDataAllocator)>::type |
80 | ThreadDataAllocatorStorage; |
81 | static typename std::aligned_storage<sizeof(ThreadDataArray), |
82 | alignof(ThreadDataArray)>::type |
83 | ThreadDataArrayStorage; |
84 | |
85 | static ThreadDataAllocator *TDAllocator = nullptr; |
86 | static ThreadDataArray *TDArray = nullptr; |
87 | |
88 | using ProfileBufferArray = Array<ProfileBuffer>; |
89 | using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType; |
90 | |
91 | // These need to be global aligned storage to avoid dynamic initialization. We |
92 | // need these to be aligned to allow us to placement new objects into the |
93 | // storage, and have pointers to those objects be appropriately aligned. |
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
96 | static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type |
97 | ProfileBufferArrayAllocatorStorage; |
98 | |
99 | static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr; |
100 | static ProfileBufferArray *ProfileBuffers = nullptr; |
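
// reset() placement-news the actual objects into the aligned storage above,
// along these lines:
//
//   new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
//   ProfileBuffers =
//       reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);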
101 | |
102 | // Use a global flag to determine whether the collector implementation has been |
103 | // initialized. |
static atomic_uint8_t CollectorInitialized{0};
105 | |
106 | } // namespace |
107 | |
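// Moves a thread's FunctionCallTrie (along with the allocators and buffers
// backing it) into the global TDArray so it can be serialized later. If the
// collector is not initialized, or if the append fails, the objects are
// destroyed and their buffers are released back to Q instead.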
108 | void post(BufferQueue *Q, FunctionCallTrie &&T, |
109 | FunctionCallTrie::Allocators &&A, |
110 | FunctionCallTrie::Allocators::Buffers &&B, |
111 | tid_t TId) XRAY_NEVER_INSTRUMENT { |
112 | DCHECK_NE(Q, nullptr); |
113 | |
114 | // Bail out early if the collector has not been initialized. |
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
116 | T.~FunctionCallTrie(); |
117 | A.~Allocators(); |
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
122 | B.~Buffers(); |
123 | return; |
124 | } |
125 | |
126 | { |
127 | SpinMutexLock Lock(&GlobalMutex); |
128 | DCHECK_NE(TDAllocator, nullptr); |
129 | DCHECK_NE(TDArray, nullptr); |
130 | |
    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
133 | // If we fail to add the data to the array, we should destroy the objects |
134 | // handed us. |
135 | T.~FunctionCallTrie(); |
136 | A.~Allocators(); |
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
141 | B.~Buffers(); |
142 | } |
143 | } |
144 | } |
145 | |
// A PathArray represents the function ids that make up a stack trace. In this
// context, a path is almost always represented from the leaf function in a
// call stack to a root of the call trie.
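// For example, for the call chain main() -> f() -> g(), the path stored for
// g()'s node is {FId(g), FId(f), FId(main)}, i.e. leaf first.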
149 | using PathArray = Array<int32_t>; |
150 | |
151 | struct ProfileRecord { |
152 | using PathAllocator = typename PathArray::AllocatorType; |
153 | |
  // The Path in this record is the function ids from the leaf to the root of
  // the function call stack, as represented in a FunctionCallTrie.
156 | PathArray Path; |
157 | const FunctionCallTrie::Node *Node; |
158 | }; |
159 | |
160 | namespace { |
161 | |
162 | using ProfileRecordArray = Array<ProfileRecord>; |
163 | |
// Performs a depth-first traversal from each root of the FunctionCallTrie to
// generate the path(s) and the data associated with each path.
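// For instance, a trie rooted at f1 whose only callee is f2 yields two
// records: one with path {f1} and one with path {f2, f1}.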
166 | static void |
167 | populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA, |
168 | const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT { |
169 | using StackArray = Array<const FunctionCallTrie::Node *>; |
170 | using StackAllocator = typename StackArray::AllocatorType; |
171 | StackAllocator StackAlloc(profilingFlags()->stack_allocator_max); |
172 | StackArray DFSStack(StackAlloc); |
173 | for (const auto *R : Trie.getRoots()) { |
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
193 | } |
194 | } |
195 | } |
196 | |
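// Writes a single block into Buffer->Data: the BlockHeader, then for each
// record the path's function ids (4 bytes each, leaf to root), a 4-byte zero
// sentinel, the node's 8-byte call count, and its 8-byte cumulative local
// time.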
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The list of function ids in the path (leaf to root) follows:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }
226 | |
227 | DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size); |
228 | } |
229 | |
230 | } // namespace |
231 | |
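// Rebuilds the global ProfileBuffers from the tries gathered in TDArray:
// previously serialized buffers are deallocated first, then one contiguous
// block per thread is emitted via populateRecords() and serializeRecords().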
232 | void serialize() XRAY_NEVER_INSTRUMENT { |
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
234 | return; |
235 | |
236 | SpinMutexLock Lock(&GlobalMutex); |
237 | |
238 | // Clear out the global ProfileBuffers, if it's not empty. |
239 | for (auto &B : *ProfileBuffers) |
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());
242 | |
243 | DCHECK_NE(TDArray, nullptr); |
244 | if (TDArray->empty()) |
245 | return; |
246 | |
247 | // Then repopulate the global ProfileBuffers. |
248 | u32 I = 0; |
249 | auto MaxSize = profilingFlags()->global_allocator_max; |
  auto ProfileArena = allocateBuffer(MaxSize);
251 | if (ProfileArena == nullptr) |
252 | return; |
253 | |
  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
256 | |
  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
258 | if (PathArena == nullptr) |
259 | return; |
260 | |
  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
263 | |
264 | for (const auto &ThreadTrie : *TDArray) { |
265 | using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType; |
266 | ProfileRecordAllocator PRAlloc(ProfileArena, |
267 | profilingFlags()->global_allocator_max); |
268 | ProfileRecord::PathAllocator PathAlloc( |
269 | PathArena, profilingFlags()->global_allocator_max); |
270 | ProfileRecordArray ProfileRecords(PRAlloc); |
271 | |
272 | // First, we want to compute the amount of space we're going to need. We'll |
273 | // use a local allocator and an __xray::Array<...> to store the intermediary |
274 | // data, then compute the size as we're going along. Then we'll allocate the |
275 | // contiguous space to contain the thread buffer data. |
276 | if (ThreadTrie.FCT.getRoots().empty()) |
277 | continue; |
278 | |
    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
280 | DCHECK(!ThreadTrie.FCT.getRoots().empty()); |
281 | DCHECK(!ProfileRecords.empty()); |
282 | |
    // Go through each record to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids)
    //   + path sentinel (4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
292 | u32 CumulativeSizes = 0; |
293 | for (const auto &Record : ProfileRecords) |
294 | CumulativeSizes += 20 + (4 * Record.Path.size()); |
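    // For example, a record whose path holds three function ids occupies
    // 20 + (4 * 3) = 32 bytes of the block.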
295 | |
    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
302 | } |
303 | } |
304 | |
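// Tears the collector state down and re-initializes it in place: serialized
// profile buffers are freed, per-thread trie buffers are released back to
// their queues, and fresh objects are placement-new'ed into the aligned
// storage before the collector is marked initialized again.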
305 | void reset() XRAY_NEVER_INSTRUMENT { |
  atomic_store(&CollectorInitialized, 0, memory_order_release);
307 | SpinMutexLock Lock(&GlobalMutex); |
308 | |
309 | if (ProfileBuffers != nullptr) { |
310 | // Clear out the profile buffers that have been serialized. |
311 | for (auto &B : *ProfileBuffers) |
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
314 | ProfileBuffers = nullptr; |
315 | } |
316 | |
317 | if (TDArray != nullptr) { |
318 | // Release the resources as required. |
319 | for (auto &TD : *TDArray) { |
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
324 | } |
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we reset the
    // pointer to nullptr and re-use the storage later (placement-new'ing into
    // the storage as-is).
329 | TDArray = nullptr; |
330 | } |
331 | |
332 | if (TDAllocator != nullptr) { |
333 | TDAllocator->~Allocator(); |
334 | TDAllocator = nullptr; |
335 | } |
336 | |
337 | if (Buffer.Data != nullptr) { |
    BQ->releaseBuffer(Buffer);
339 | } |
340 | |
341 | if (BQ == nullptr) { |
342 | bool Success = false; |
343 | new (&BufferQueueStorage) |
344 | BufferQueue(profilingFlags()->global_allocator_max, 1, Success); |
345 | if (!Success) |
346 | return; |
347 | BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); |
348 | } else { |
349 | BQ->finalize(); |
350 | |
    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
353 | return; |
354 | } |
355 | |
  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
357 | return; |
358 | |
359 | new (&ProfileBufferArrayAllocatorStorage) |
360 | ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max); |
361 | ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>( |
362 | &ProfileBufferArrayAllocatorStorage); |
363 | |
364 | new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator); |
365 | ProfileBuffers = |
366 | reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage); |
367 | |
368 | new (&ThreadDataAllocatorStorage) |
369 | ThreadDataAllocator(Buffer.Data, Buffer.Size); |
370 | TDAllocator = |
371 | reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage); |
372 | new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator); |
373 | TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage); |
374 | |
375 | atomic_store(a: &CollectorInitialized, v: 1, mo: memory_order_release); |
376 | } |
377 | |
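// Iterator-style access to the serialized profile: passing {nullptr, 0}
// yields the file header, passing the file header yields the first block, and
// each block yields the next until a {nullptr, 0} result signals the end. A
// consumer might drain the stream like this (illustrative sketch; consume()
// is a placeholder):
//
//   XRayBuffer B{nullptr, 0};
//   while ((B = nextBuffer(B)).Data != nullptr)
//     consume(B.Data, B.Size);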
378 | XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT { |
379 | SpinMutexLock Lock(&GlobalMutex); |
380 | |
381 | if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0) |
    return {nullptr, 0};
383 | |
384 | static pthread_once_t Once = PTHREAD_ONCE_INIT; |
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });
391 | |
392 | if (UNLIKELY(B.Data == nullptr)) { |
393 | // The first buffer should always contain the file header information. |
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
399 | } |
400 | |
401 | if (UNLIKELY(B.Data == &FileHeaderStorage)) |
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
403 | |
  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
406 | auto NextBlock = Header.BlockNum + 1; |
407 | if (NextBlock < ProfileBuffers->size()) |
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
411 | } |
412 | |
413 | } // namespace profileCollectorService |
414 | } // namespace __xray |
415 | |