// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "primref.h"

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }
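
      /* Worked example of the bump allocation above (illustrative numbers):
       * with cur = 10 and align = 16, ofs = (16 - 10) & 15 = 6, so cur
       * advances to 16 + bytes and the returned pointer ptr + 16 is 16-byte
       * aligned. This relies on ptr itself being maxAlignment-aligned, which
       * holds because blocks are requested with maxAlignment alignment. */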

      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char*  ptr;             //!< pointer to memory block
      size_t cur;             //!< current location of the allocator
      size_t end;             //!< end of the memory block
      size_t allocBlockSize;  //!< block size for allocations
      size_t bytesUsed;       //!< number of total bytes allocated
      size_t bytesWasted;     //!< number of bytes wasted
    };

    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
        Lock<SpinLock> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
          alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<SpinLock> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
        alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      SpinLock mutex;                    //!< required as unbind is called from other threads
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };
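
    /* Note: keeping two ThreadLocal allocators per thread lets a builder
     * maintain two independent allocation streams (typically one for nodes
     * and one for leaf primitives, see CachedAllocator::malloc0/malloc1
     * below), so data of the same kind stays contiguous within blocks. */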

    FastAllocator (Device* device, bool osAllocation)
      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
        primrefarray(device,0)
    {
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<SpinLock> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:

    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<SpinLock> lock(thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }

    private:
      FastAllocator* allocator;
    };
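
    /* Illustrative usage sketch (an assumed builder-side pattern, not part of
     * this header): each worker thread obtains a CachedAllocator and routes
     * its allocations through it, e.g.
     *
     *   FastAllocator::Create create(&allocator);
     *   FastAllocator::CachedAllocator alloc = create();     // once per thread/task
     *   void* node = alloc.malloc0(nodeBytes, 64);           // e.g. BVH node data
     *   void* leaf = alloc.malloc1(numPrims*primBytes, 64);  // e.g. leaf primitive data
     *
     * where nodeBytes and primBytes are hypothetical sizes of the builder's data. */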

    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic  = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit the number of threads use the optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit the number of threads by calculating a proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }
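
    /* Worked example (illustrative numbers): with defaultBlockSize = 4096,
     * use_single_mode = false and threadLocalAllocOverhead = 20,
     * singleThreadBytes = 2*20*4096 = 163840. For bytesEstimated = 1048576,
     * numPrimitives = 10000 and threadCount = 16, ceil(1048576/163840) = 7 < 16,
     * so the thread count gets limited: bytesPerPrimitive = 104.8576 and for
     * branchingFactor = 2 the returned threshold is
     * ceil(2*163840/104.8576) = 3125 primitives. */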

    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most mainAllocOverhead gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots while still guaranteeing
       * the mainAllocOverhead */
      slotMask = 0x0;

      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated >  2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated >  4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated >  8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for a sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }
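
    /* Illustrative numbers for the heuristic above: with fast = false
     * (mainAllocOverhead = 20) and bytesEstimated = 512 MB, blockSize clamps
     * to maxAllocationSize, so growSize = maxGrowSize ~ 2 MB. The slot tests
     * then pass for 2*20, 4*20 and 8*20 times growSize, giving slotMask = 0x7
     * (8 slots), while the 16*20*growSize test fails, so growSize is not
     * doubled. */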

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);
    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }
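
    /* Example: successive calls return 2, 4, 8, ..., so the fallback path in
     * malloc() below grows block sizes geometrically via
     * min(growSize*incGrowSizeScale(),maxGrowSize) until maxGrowSize caps
     * them; the scale factor itself is capped at 1<<16. */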

    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<SpinLock> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<SpinLock> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<SpinLock> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }
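
    /* Example of the alignment handling above (illustrative addresses): for a
     * shared buffer at ptr = 0x1010 with bytes = 8192, aptr rounds up to
     * 0x1040, ofs = 48 is recorded as wasted, and the remaining 8144 bytes
     * (minus the Block header) become a SHARED block at the head of the free
     * list. */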

    /* special allocation, used only by the Morton builder and only a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed   += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree   += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree   += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)

        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,OS_MALLOC,false),
          stat_2M(alloc,OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)

        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K    : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M    : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    struct Block
    {
      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages when using os_malloc */
        if (atype == OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin +              0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else prev_next = &block->next;
        }
        return head;
      }

      void clear_list(MemoryMonitorInterface* device)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device);
          block = next;
        }
      }

      void clear_block (MemoryMonitorInterface* device)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          alignedFree(this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else if (atype == OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else /* if (atype == SHARED) */ {
        }
      }

      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }
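
      /* Note on the semantics above: with partial == false the rounded-up
       * request must fit completely into the reserved range or nullptr is
       * returned; with partial == true the caller may receive fewer bytes
       * than requested (bytes_in is updated to the truncated size), which
       * ThreadLocal::malloc uses to consume the tail of a block as a smaller
       * thread-local block. */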

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE ) return true;
        else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else                         return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == OS_MALLOC) std::cout << "O";
        else if (atype == SHARED) std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  private:
    Device* device;
    SpinLock mutex;
    size_t slotMask;
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;

    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];

    bool use_single_mode;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;
    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;
    static __thread ThreadLocal2* thread_local_allocator2;
    static SpinLock s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
    SpinLock thread_local_allocators_lock;
    std::vector<ThreadLocal2*> thread_local_allocators;
    AllocationType atype;
    mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
  };
}

source code of qtquick3d/src/3rdparty/embree/kernels/common/alloc.h