//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

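// Each per-thread block in the serialized output starts with one of these.
// BlockSize counts the whole block, including this header; BlockNum is the
// block's index, which nextBuffer() below uses to locate the following block
// during iteration.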
struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};

struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and
// tries associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
static typename std::aligned_storage<
    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
static typename std::aligned_storage<sizeof(ThreadDataAllocator),
                                     alignof(ThreadDataAllocator)>::type
    ThreadDataAllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadDataArray),
                                     alignof(ThreadDataArray)>::type
    ThreadDataArrayStorage;

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized.
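// Loads of the flag in post() and serialize() use acquire ordering to pair
// with the release store at the end of reset(), so a thread that observes the
// flag set also observes the fully-initialized globals above.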
static atomic_uint8_t CollectorInitialized{0};

} // namespace

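// Takes ownership of a thread's FunctionCallTrie along with the allocators
// and buffers backing it. On success the data is moved into the global
// ThreadData array; on any failure path the objects are destroyed and their
// buffers released back to the queue, so nothing leaks.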
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the objects
      // handed us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray represents a stack trace as a sequence of function IDs. In this
// context a path is almost always represented from the leaf function in a
// call stack to a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the function IDs from the leaf to the root of
  // the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Walk a depth-first traversal of each root of the FunctionCallTrie to
// generate the path(s) and the data associated with the path.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
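  // Note that the traversal uses an explicit stack backed by the stack
  // allocator, so walking a deep trie does not grow the thread's call stack.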
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());
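      // For example, a node with FId 3 whose parent has FId 2 under a root
      // with FId 1 yields Path == [3, 2, 1]: leaf first, root last.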

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

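// Serialize the given records into Buffer, which serialize() has sized with
// the accounting described below: each record is written as its path's
// function IDs (4 bytes each, leaf to root), a 4-byte zero sentinel, then the
// 8-byte call count and the 8-byte cumulative local time.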
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // List of IDs follow:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

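// Regenerate the global ProfileBuffers from the tries posted so far,
// producing one serialized block per posted thread trie (empty tries are
// skipped). This is a no-op until reset() has initialized the collector.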
void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We
    // use a local allocator and an __xray::Array<...> to store the
    // intermediary data, computing the size as we go. Then we allocate the
    // contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    //   header size = block size (4 bytes)
    //     + block number (4 bytes)
    //     + thread id (8 bytes)
    //   record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
    //     + call count (8 bytes)
    //     + local time (8 bytes)
    //
    // That is 20 bytes per record plus 4 bytes per path entry, which is
    // exactly what the loop below accumulates.
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());
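    // For example, a record whose path holds three function IDs occupies
    // 20 + 4 * 3 = 32 bytes of the block.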

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

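// Tear down any previously-published state and (re)initialize the collector:
// the buffer queue, the ThreadData array, and the ProfileBuffers are
// placement-new'ed into their static storage, and the initialized flag is
// published last with release semantics.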
void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we're going
    // to reset the pointer to nullptr, and re-use the storage later
    // (placement-new'ing into the storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

// Implements the XRayBuffer iteration protocol: calling this with a zeroed
// XRayBuffer yields the file header; calling it with the header yields block
// 0; calling it with block N yields block N + 1, then a null buffer once the
// blocks are exhausted.
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}
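
// A minimal sketch of driving the iteration above (hypothetical consumer and
// consume() helper; in the XRay runtime the actual caller is the log
// implementation machinery that registered nextBuffer):
//
//   XRayBuffer Buf = nextBuffer({nullptr, 0}); // first call yields the header
//   while (Buf.Data != nullptr) {
//     consume(Buf.Data, Buf.Size);             // e.g., append to a file
//     Buf = nextBuffer(Buf);                   // header, block 0, block 1, ...
//   }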

} // namespace profileCollectorService
} // namespace __xray