// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "primref.h"

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }
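
      /* Worked example of the bump allocation above (illustrative numbers):
       * with cur = 10 and align = 16, ofs = (16 - 10) & 15 = 6, so cur
       * advances to 16 + bytes and the returned pointer ptr + 16 is 16-byte
       * aligned. This relies on ptr itself being maxAlignment-aligned, which
       * holds because blocks are requested with maxAlignment alignment. */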

      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char*  ptr;             //!< pointer to memory block
      size_t cur;             //!< current location of the allocator
      size_t end;             //!< end of the memory block
      size_t allocBlockSize;  //!< block size for allocations
      size_t bytesUsed;       //!< number of total bytes allocated
      size_t bytesWasted;     //!< number of bytes wasted
    };

    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
        Lock<SpinLock> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
          alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<SpinLock> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
        alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      SpinLock mutex;                    //!< required as unbind is called from other threads
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };
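
    /* Note: keeping two ThreadLocal allocators per thread lets a builder
     * maintain two independent allocation streams (typically one for nodes
     * and one for leaf primitives, see CachedAllocator::malloc0/malloc1
     * below), so data of the same kind stays contiguous within blocks. */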

    FastAllocator (Device* device, bool osAllocation)
      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
        primrefarray(device,0)
    {
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<SpinLock> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:

    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<SpinLock> lock(thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }

    private:
      FastAllocator* allocator;
    };
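
    /* Illustrative usage sketch (an assumed builder-side pattern, not part of
     * this header): each worker thread obtains a CachedAllocator and routes
     * its allocations through it, e.g.
     *
     *   FastAllocator::Create create(&allocator);
     *   FastAllocator::CachedAllocator alloc = create();     // once per thread/task
     *   void* node = alloc.malloc0(nodeBytes, 64);           // e.g. BVH node data
     *   void* leaf = alloc.malloc1(numPrims*primBytes, 64);  // e.g. leaf primitive data
     *
     * where nodeBytes and primBytes are hypothetical sizes of the builder's data. */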

    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic  = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit the number of threads use the optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit the number of threads by calculating a proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }
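
    /* Worked example (illustrative numbers): with defaultBlockSize = 4096,
     * use_single_mode = false and threadLocalAllocOverhead = 20,
     * singleThreadBytes = 2*20*4096 = 163840. For bytesEstimated = 1048576,
     * numPrimitives = 10000 and threadCount = 16, ceil(1048576/163840) = 7 < 16,
     * so the thread count gets limited: bytesPerPrimitive = 104.8576 and for
     * branchingFactor = 2 the returned threshold is
     * ceil(2*163840/104.8576) = 3125 primitives. */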

    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots while still guaranteeing
       * the mainAllocOverhead */
      slotMask = 0x0;

      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated >  2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated >  4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated >  8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for a sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }
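
    /* Illustrative numbers for the heuristic above: with fast = false
     * (mainAllocOverhead = 20) and bytesEstimated = 512 MB, blockSize clamps
     * to maxAllocationSize, so growSize = maxGrowSize ~ 2 MB. The slot tests
     * then pass for 2*20, 4*20 and 8*20 times growSize, giving slotMask = 0x7
     * (8 slots), while the 16*20*growSize test fails, so growSize is not
     * doubled. */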

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);
    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }
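
    /* Example: successive calls return 2, 4, 8, ..., so the fallback path in
     * malloc() below grows block sizes geometrically via
     * min(growSize*incGrowSizeScale(),maxGrowSize) until maxGrowSize caps
     * them; the scale factor itself is capped at 1<<16. */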

    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<SpinLock> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<SpinLock> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<SpinLock> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }
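
    /* Example of the alignment handling above (illustrative addresses): for a
     * shared buffer at ptr = 0x1010 with bytes = 8192, aptr rounds up to
     * 0x1040, ofs = 48 is recorded as wasted, and the remaining 8144 bytes
     * (minus the Block header) become a SHARED block at the head of the free
     * list. */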

    /* special allocation, used only by the Morton builder and only a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed   += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree   += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree   += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)

        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,OS_MALLOC,false),
          stat_2M(alloc,OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)

        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K    : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M    : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    struct Block
    {
      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages when using os_malloc */
        if (atype == OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin +              0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else prev_next = &block->next;
        }
        return head;
      }

      void clear_list(MemoryMonitorInterface* device)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device);
          block = next;
        }
      }

      void clear_block (MemoryMonitorInterface* device)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          alignedFree(this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else if (atype == OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else /* if (atype == SHARED) */ {
        }
      }

      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }
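
      /* Note on the semantics above: with partial == false the rounded-up
       * request must fit completely into the reserved range or nullptr is
       * returned; with partial == true the caller may receive fewer bytes
       * than requested (bytes_in is updated to the truncated size), which
       * ThreadLocal::malloc uses to consume the tail of a block as a smaller
       * thread-local block. */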

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE ) return true;
        else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else                         return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  private:
    Device* device;
    SpinLock mutex;
    size_t slotMask;
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;

    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];

    bool use_single_mode;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;
    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;
    static __thread ThreadLocal2* thread_local_allocator2;
    static SpinLock s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
    SpinLock thread_local_allocators_lock;
    std::vector<ThreadLocal2*> thread_local_allocators;
    AllocationType atype;
    mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
  };
}

source code of qtquick3d/src/3rdparty/embree/kernels/common/alloc.h