// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "default.h"
#include "device.h"
#include "scene.h"
#include "primref.h"

namespace embree
{
  class FastAllocator
  {
    /*! maximum supported alignment */
    static const size_t maxAlignment = 64;

    /*! maximum allocation size */

    /* default settings */
    //static const size_t defaultBlockSize = 4096;
#define maxAllocationSize size_t(2*1024*1024-maxAlignment)

    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;

  public:

    struct ThreadLocal2;
    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };

    /*! Per thread structure holding the current memory block. */
    struct __aligned(64) ThreadLocal
    {
      ALIGNED_CLASS_(64);
    public:

      /*! Constructor for usage with ThreadLocalData */
      __forceinline ThreadLocal (ThreadLocal2* parent)
        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}

      /*! initialize allocator */
      void init(FastAllocator* alloc)
      {
        ptr = nullptr;
        cur = end = 0;
        bytesUsed = 0;
        bytesWasted = 0;
        allocBlockSize = 0;
        if (alloc) allocBlockSize = alloc->defaultBlockSize;
      }

      /* Allocate aligned memory from the thread's memory block. */
      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
      {
        /* bind the thread local allocator to the proper FastAllocator */
        parent->bind(alloc);

        assert(align <= maxAlignment);
        bytesUsed += bytes;

        /* try to allocate in local block */
        size_t ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* if allocation is too large allocate with parent allocator */
        if (4*bytes > allocBlockSize) {
          return alloc->malloc(bytes,maxAlignment,false);
        }

        /* get new partial block if allocation failed */
        size_t blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* get new full block if allocation failed */
        blockSize = allocBlockSize;
        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
        bytesWasted += end-cur;
        cur = 0; end = blockSize;

        /* retry allocation */
        ofs = (align - cur) & (align-1);
        cur += bytes + ofs;
        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
        cur -= bytes + ofs;

        /* should never happen as large allocations get handled specially above */
        assert(false);
        return nullptr;
      }
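
      /* Example of the bump-pointer arithmetic above (illustrative numbers):
       * with cur = 70 and align = 16, ofs = (16 - 70) & 15 = 10, so the
       * allocation starts at offset 80 (16-byte aligned) and the 10 skipped
       * bytes are accounted as bytesWasted. */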


      /*! returns amount of used bytes */
      __forceinline size_t getUsedBytes() const { return bytesUsed; }

      /*! returns amount of free bytes */
      __forceinline size_t getFreeBytes() const { return end-cur; }

      /*! returns amount of wasted bytes */
      __forceinline size_t getWastedBytes() const { return bytesWasted; }

    private:
      ThreadLocal2* parent;
      char* ptr;              //!< pointer to memory block
      size_t cur;             //!< current location of the allocator
      size_t end;             //!< end of the memory block
      size_t allocBlockSize;  //!< block size for allocations
      size_t bytesUsed;       //!< number of total bytes allocated
      size_t bytesWasted;     //!< number of bytes wasted
    };

    /*! Two thread local structures. */
    struct __aligned(64) ThreadLocal2
    {
      ALIGNED_CLASS_(64);
    public:

      __forceinline ThreadLocal2()
        : alloc(nullptr), alloc0(this), alloc1(this) {}

      /*! bind to fast allocator */
      __forceinline void bind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() == alloc_i) return;
        Lock<SpinLock> lock(mutex);
        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
        if (alloc.load()) {
          alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
          alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        }
        alloc0.init(alloc_i);
        alloc1.init(alloc_i);
        alloc.store(alloc_i);
        alloc_i->join(this);
      }

      /*! unbind from fast allocator */
      void unbind(FastAllocator* alloc_i)
      {
        assert(alloc_i);
        if (alloc.load() != alloc_i) return;
        Lock<SpinLock> lock(mutex);
        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
        alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes();
        alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes();
        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
        alloc0.init(nullptr);
        alloc1.init(nullptr);
        alloc.store(nullptr);
      }

    public:
      SpinLock mutex;                    //!< required as unbind is called from other threads
      std::atomic<FastAllocator*> alloc; //!< parent allocator
      ThreadLocal alloc0;
      ThreadLocal alloc1;
    };

    FastAllocator (Device* device, bool osAllocation)
      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
        primrefarray(device,0)
    {
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
        assert(!slotMutex[i].isLocked());
      }
    }

    ~FastAllocator () {
      clear();
    }

    /*! returns the device attached to this allocator */
    Device* getDevice() {
      return device;
    }

    void share(mvector<PrimRef>& primrefarray_i) {
      primrefarray = std::move(primrefarray_i);
    }

    void unshare(mvector<PrimRef>& primrefarray_o)
    {
      reset(); // this removes blocks that are allocated inside the shared primref array
      primrefarray_o = std::move(primrefarray);
    }

    /*! returns first fast thread local allocator */
    __forceinline ThreadLocal* _threadLocal() {
      return &threadLocal2()->alloc0;
    }

    void setOSallocation(bool flag)
    {
      atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
    }

  private:

    /*! returns both fast thread local allocators */
    __forceinline ThreadLocal2* threadLocal2()
    {
      ThreadLocal2* alloc = thread_local_allocator2;
      if (alloc == nullptr) {
        thread_local_allocator2 = alloc = new ThreadLocal2;
        Lock<SpinLock> lock(s_thread_local_allocators_lock);
        s_thread_local_allocators.push_back(make_unique(alloc));
      }
      return alloc;
    }

  public:

    __forceinline void join(ThreadLocal2* alloc)
    {
      Lock<SpinLock> lock(thread_local_allocators_lock);
      thread_local_allocators.push_back(alloc);
    }

  public:

    struct CachedAllocator
    {
      __forceinline CachedAllocator(void* ptr)
        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
      {
        assert(ptr == nullptr);
      }

      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}

      __forceinline operator bool () const {
        return alloc != nullptr;
      }

      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
        return talloc0->malloc(alloc,bytes,align);
      }

      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
        return talloc1->malloc(alloc,bytes,align);
      }

    public:
      FastAllocator* alloc;
      ThreadLocal* talloc0;
      ThreadLocal* talloc1;
    };

    __forceinline CachedAllocator getCachedAllocator() {
      return CachedAllocator(this,threadLocal2());
    }
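
    /* Illustrative usage sketch (not part of this header): a builder thread
     * typically requests a CachedAllocator once and allocates node and leaf
     * memory through it; the byte counts and alignments below are hypothetical.
     *
     *   FastAllocator::CachedAllocator alloc = allocator.getCachedAllocator();
     *   void* node = alloc.malloc0(128, 64); // e.g. an inner node from the first thread-local slab
     *   void* leaf = alloc.malloc1(64);      // e.g. leaf data from the second slab (default 16-byte alignment)
     */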

    /*! Builder interface to create thread local allocator */
    struct Create
    {
    public:
      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
      __forceinline CachedAllocator operator() () const { return allocator->getCachedAllocator(); }

    private:
      FastAllocator* allocator;
    };

    void internal_fix_used_blocks()
    {
      /* move thread local blocks to global block list */
      for (size_t i = 0; i < MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        while (threadBlocks[i].load() != nullptr) {
          Block* nextUsedBlock = threadBlocks[i].load()->next;
          threadBlocks[i].load()->next = usedBlocks.load();
          usedBlocks = threadBlocks[i].load();
          threadBlocks[i] = nextUsedBlock;
        }
        threadBlocks[i] = nullptr;
      }
    }

    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
    static const size_t mainAllocOverheadStatic  = 20; //! 20 means 5% allocation overhead through unfilled main alloc blocks
    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks

    /* calculates a single threaded threshold for the builders such
     * that for small scenes the overhead of partly allocated blocks
     * per thread is low */
    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
    {
      if (numPrimitives == 0 || bytesEstimated == 0)
        return defaultThreshold;

      /* calculate block size in bytes to fulfill threadLocalAllocOverhead constraint */
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSize;

      /* if we do not have to limit number of threads use optimal threshold */
      if ( (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        return defaultThreshold;

      /* otherwise limit number of threads by calculating proper single thread threshold */
      else {
        double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
        return size_t(ceil(branchingFactor*singleThreadBytes/bytesPerPrimitive));
      }
    }
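
    /* Worked example for the threshold above (illustrative numbers, assuming
     * PAGE_SIZE = 4096, defaultBlockSize = 4096 and use_single_mode = false):
     * singleThreadBytes = 2*20*4096 = 163840 bytes. A scene estimated at 1 MB
     * fills only ceil(1048576/163840) = 7 per-thread budgets, so with 8
     * threads the else branch is taken; for 16384 primitives (64 bytes/prim)
     * and branchingFactor = 2 the returned threshold is
     * ceil(2*163840/64) = 5120 primitives. */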

    __forceinline size_t alignSize(size_t i) {
      return (i+127)/128*128;
    }

    /*! initializes the grow size and the number of slots */
    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast)
    {
      /* we do not need single thread local allocator mode */
      use_single_mode = false;

      /* calculate growSize such that at most 1/mainAllocOverhead of the estimated bytes gets wasted when a block stays unused */
      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);

      /* if we reached the maxAllocationSize for growSize, we can
       * increase the number of allocation slots while still guaranteeing
       * the mainAllocOverhead bound */
      slotMask = 0x0;

      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */

      /* set the thread local alloc block size */
      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;

      /* for sufficiently large scenes we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
      const size_t threadCount = TaskScheduler::threadCount();
      const size_t single_mode_factor = use_single_mode ? 1 : 2;
      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
      if ((bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);

      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
      else
#endif
      defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);

      if (bytesEstimated == 0) {
        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
        defaultBlockSize = defaultBlockSizeSwitch;
      }
      log2_grow_size_scale = 0;

      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
    }

    /*! initializes the allocator */
    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      /* distribute the allocation to multiple thread block slots */
      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      if (bytesReserve == 0) bytesReserve = bytesAllocate;
      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
      estimatedSize = bytesEstimate;
      initGrowSizeAndNumSlots(bytesEstimate,true);
    }
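
    /* Typical builder usage sketch (illustrative, byte counts hypothetical):
     *
     *   FastAllocator allocator(device, false);
     *   allocator.init(1024*1024, 4*1024*1024, 1024*1024); // allocate/reserve/estimate
     *   FastAllocator::CachedAllocator thread_alloc = allocator.getCachedAllocator();
     *   void* node = thread_alloc.malloc0(128, 64);
     *   allocator.cleanup(); // unbind thread local allocators after the build
     */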

    /*! initializes the allocator */
    void init_estimate(size_t bytesEstimate)
    {
      internal_fix_used_blocks();
      if (usedBlocks.load() || freeBlocks.load()) { reset(); return; }
      /* single allocator mode ? */
      estimatedSize = bytesEstimate;
      //initGrowSizeAndNumSlots(bytesEstimate,false);
      initGrowSizeAndNumSlots(bytesEstimate,false);

    }

    /*! frees state not required after build */
    __forceinline void cleanup()
    {
      internal_fix_used_blocks();

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! resets the allocator, memory blocks get reused */
    void reset ()
    {
      internal_fix_used_blocks();

      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);

      /* reset all used blocks and move them to begin of free block list */
      while (usedBlocks.load() != nullptr) {
        usedBlocks.load()->reset_block();
        Block* nextUsedBlock = usedBlocks.load()->next;
        usedBlocks.load()->next = freeBlocks.load();
        freeBlocks = usedBlocks.load();
        usedBlocks = nextUsedBlock;
      }

      /* remove all shared blocks as they are re-added during build */
      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));

      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
      {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }

      /* unbind all thread local allocators */
      for (auto alloc : thread_local_allocators) alloc->unbind(this);
      thread_local_allocators.clear();
    }

    /*! frees all allocated memory */
    __forceinline void clear()
    {
      cleanup();
      bytesUsed.store(0);
      bytesFree.store(0);
      bytesWasted.store(0);
      if (usedBlocks.load() != nullptr) usedBlocks.load()->clear_list(device); usedBlocks = nullptr;
      if (freeBlocks.load() != nullptr) freeBlocks.load()->clear_list(device); freeBlocks = nullptr;
      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) {
        threadUsedBlocks[i] = nullptr;
        threadBlocks[i] = nullptr;
      }
      primrefarray.clear();
    }

    __forceinline size_t incGrowSizeScale()
    {
      size_t scale = log2_grow_size_scale.fetch_add(1)+1;
      return size_t(1) << min(size_t(16),scale);
    }
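
    /* Example: successive calls return 2, 4, 8, ... (capped at 1<<16); the
     * fallback path in malloc() below multiplies growSize by this factor, so
     * newly created blocks grow geometrically until
     * min(growSize*incGrowSizeScale(),maxGrowSize) saturates at maxGrowSize. */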

    /*! thread safe allocation of memory */
    void* malloc(size_t& bytes, size_t align, bool partial)
    {
      assert(align <= maxAlignment);

      while (true)
      {
        /* allocate using current block */
        size_t threadID = TaskScheduler::threadID();
        size_t slot = threadID & slotMask;
        Block* myUsedBlocks = threadUsedBlocks[slot];
        if (myUsedBlocks) {
          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
          if (ptr) return ptr;
        }

        /* throw error if allocation is too large */
        if (bytes > maxAllocationSize)
          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");

        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
        if (likely(freeBlocks.load() == nullptr))
        {
          Lock<SpinLock> lock(slotMutex[slot]);
          if (myUsedBlocks == threadUsedBlocks[slot]) {
            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
            assert(allocSize >= bytes);
            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
          }
          continue;
        }

        /* if this fails allocate new block */
        {
          Lock<SpinLock> lock(mutex);
          if (myUsedBlocks == threadUsedBlocks[slot])
          {
            if (freeBlocks.load() != nullptr) {
              Block* nextFreeBlock = freeBlocks.load()->next;
              freeBlocks.load()->next = usedBlocks;
              __memory_barrier();
              usedBlocks = freeBlocks.load();
              threadUsedBlocks[slot] = freeBlocks.load();
              freeBlocks = nextFreeBlock;
            } else {
              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
              usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
            }
          }
        }
      }
    }

    /*! add new block */
    void addBlock(void* ptr, ssize_t bytes)
    {
      Lock<SpinLock> lock(mutex);
      const size_t sizeof_Header = offsetof(Block,data[0]);
      void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
      size_t ofs = (size_t) aptr - (size_t) ptr;
      bytes -= ofs;
      if (bytes < 4096) return; // ignore empty or very small blocks
      freeBlocks = new (aptr) Block(SHARED,bytes-sizeof_Header,bytes-sizeof_Header,freeBlocks,ofs);
    }
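
    /* Sketch of handing an externally allocated buffer to the allocator
     * (buffer and size below are hypothetical). Shared blocks are never
     * freed by the allocator and are dropped from the free list again by
     * reset().
     *
     *   void* buffer = alignedMalloc(1024*1024,64);
     *   allocator.addBlock(buffer,1024*1024); // registered as a SHARED block
     */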

    /* special allocation, used by the Morton builder only a single time per build */
    void* specialAlloc(size_t bytes)
    {
      assert(freeBlocks.load() != nullptr && freeBlocks.load()->getBlockAllocatedBytes() >= bytes);
      return freeBlocks.load()->ptr();
    }

    struct Statistics
    {
      Statistics ()
        : bytesUsed(0), bytesFree(0), bytesWasted(0) {}

      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
        : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}

      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
        : bytesUsed(0), bytesFree(0), bytesWasted(0)
      {
        Block* usedBlocks = alloc->usedBlocks.load();
        Block* freeBlocks = alloc->freeBlocks.load();
        if (usedBlocks) bytesUsed += usedBlocks->getUsedBytes(atype,huge_pages);
        if (freeBlocks) bytesFree += freeBlocks->getAllocatedBytes(atype,huge_pages);
        if (usedBlocks) bytesFree += usedBlocks->getFreeBytes(atype,huge_pages);
        if (freeBlocks) bytesWasted += freeBlocks->getWastedBytes(atype,huge_pages);
        if (usedBlocks) bytesWasted += usedBlocks->getWastedBytes(atype,huge_pages);
      }

      std::string str(size_t numPrimitives)
      {
        std::stringstream str;
        str.setf(std::ios::fixed, std::ios::floatfield);
        str << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
        return str.str();
      }

      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
      {
        return Statistics(a.bytesUsed+b.bytesUsed,
                          a.bytesFree+b.bytesFree,
                          a.bytesWasted+b.bytesWasted);
      }

      size_t bytesAllocatedTotal() const {
        return bytesUsed + bytesFree + bytesWasted;
      }

    public:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
    };

    Statistics getStatistics(AllocationType atype, bool huge_pages = false) {
      return Statistics(this,atype,huge_pages);
    }
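
    /* Example: report how much memory a build consumed (illustrative;
     * 'allocator' and 'numPrimitives' are assumed to exist in the caller):
     *
     *   FastAllocator::Statistics stats = allocator.getStatistics(FastAllocator::ANY_TYPE);
     *   std::cout << stats.str(numPrimitives) << std::endl;
     */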

    size_t getUsedBytes() {
      return bytesUsed;
    }

    size_t getWastedBytes() {
      return bytesWasted;
    }

    struct AllStatistics
    {
      AllStatistics (FastAllocator* alloc)

        : bytesUsed(alloc->bytesUsed),
          bytesFree(alloc->bytesFree),
          bytesWasted(alloc->bytesWasted),
          stat_all(alloc,ANY_TYPE),
          stat_malloc(alloc,ALIGNED_MALLOC),
          stat_4K(alloc,OS_MALLOC,false),
          stat_2M(alloc,OS_MALLOC,true),
          stat_shared(alloc,SHARED) {}

      AllStatistics (size_t bytesUsed,
                     size_t bytesFree,
                     size_t bytesWasted,
                     Statistics stat_all,
                     Statistics stat_malloc,
                     Statistics stat_4K,
                     Statistics stat_2M,
                     Statistics stat_shared)

        : bytesUsed(bytesUsed),
          bytesFree(bytesFree),
          bytesWasted(bytesWasted),
          stat_all(stat_all),
          stat_malloc(stat_malloc),
          stat_4K(stat_4K),
          stat_2M(stat_2M),
          stat_shared(stat_shared) {}

      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
      {
        return AllStatistics(a.bytesUsed+b.bytesUsed,
                             a.bytesFree+b.bytesFree,
                             a.bytesWasted+b.bytesWasted,
                             a.stat_all + b.stat_all,
                             a.stat_malloc + b.stat_malloc,
                             a.stat_4K + b.stat_4K,
                             a.stat_2M + b.stat_2M,
                             a.stat_shared + b.stat_shared);
      }

      void print(size_t numPrimitives)
      {
        std::stringstream str0;
        str0.setf(std::ios::fixed, std::ios::floatfield);
        str0 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << " "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
        std::cout << str0.str() << std::endl;

        std::stringstream str1;
        str1.setf(std::ios::fixed, std::ios::floatfield);
        str1 << " alloc : "
             << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
             << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
             << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
             << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*(bytesUsed+bytesFree+bytesWasted) << " MB, "
             << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed+bytesFree+bytesWasted)/double(numPrimitives);
        std::cout << str1.str() << std::endl;

        std::cout << " total : " << stat_all.str(numPrimitives) << std::endl;
        std::cout << " 4K : " << stat_4K.str(numPrimitives) << std::endl;
        std::cout << " 2M : " << stat_2M.str(numPrimitives) << std::endl;
        std::cout << " malloc: " << stat_malloc.str(numPrimitives) << std::endl;
        std::cout << " shared: " << stat_shared.str(numPrimitives) << std::endl;
      }

    private:
      size_t bytesUsed;
      size_t bytesFree;
      size_t bytesWasted;
      Statistics stat_all;
      Statistics stat_malloc;
      Statistics stat_4K;
      Statistics stat_2M;
      Statistics stat_shared;
    };

    void print_blocks()
    {
      std::cout << " estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;

      std::cout << " used blocks = ";
      if (usedBlocks.load() != nullptr) usedBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;

      std::cout << " free blocks = ";
      if (freeBlocks.load() != nullptr) freeBlocks.load()->print_list();
      std::cout << "[END]" << std::endl;
    }

  private:

    struct Block
    {
      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
      {
        /* We avoid using os_malloc for small blocks as this could
         * cause a risk of fragmenting the virtual address space and
         * reach the limit of vm.max_map_count = 65k under Linux. */
        if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
          atype = ALIGNED_MALLOC;

        /* we need to additionally allocate some header */
        const size_t sizeof_Header = offsetof(Block,data[0]);
        bytesAllocate = sizeof_Header+bytesAllocate;
        bytesReserve  = sizeof_Header+bytesReserve;

        /* consume full 4k pages when using os_malloc */
        if (atype == OS_MALLOC) {
          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
        }

        /* either use alignedMalloc or os_malloc */
        void *ptr = nullptr;
        if (atype == ALIGNED_MALLOC)
        {
          /* special handling for default block size */
          if (bytesAllocate == (2*PAGE_SIZE_2M))
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);

            /* give hint to transparently convert these pages to 2MB pages */
            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
            os_advise((void*)(ptr_aligned_begin + 0),PAGE_SIZE_2M); // may fail if no memory mapped before block
            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block

            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
          else
          {
            const size_t alignment = maxAlignment;
            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
            ptr = alignedMalloc(bytesAllocate,alignment);
            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
          }
        }
        else if (atype == OS_MALLOC)
        {
          if (device) device->memoryMonitor(bytesAllocate,false);
          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
          return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
        }
        else
          assert(false);

        return NULL;
      }

      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
        : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
      {
        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
      }

      static Block* remove_shared_blocks(Block* head)
      {
        Block** prev_next = &head;
        for (Block* block = head; block; block = block->next) {
          if (block->atype == SHARED) *prev_next = block->next;
          else prev_next = &block->next;
        }
        return head;
      }

      void clear_list(MemoryMonitorInterface* device)
      {
        Block* block = this;
        while (block) {
          Block* next = block->next;
          block->clear_block(device);
          block = next;
        }
      }

      void clear_block (MemoryMonitorInterface* device)
      {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();

        if (atype == ALIGNED_MALLOC) {
          alignedFree(this);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else if (atype == OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
        }

        else /* if (atype == SHARED) */ {
        }
      }

      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
      {
        size_t bytes = bytes_in;
        assert(align <= maxAlignment);
        bytes = (bytes+(align-1)) & ~(align-1);
        if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
        const size_t i = cur.fetch_add(bytes);
        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
        if (unlikely(i > reserveEnd)) return nullptr;
        bytes_in = bytes = min(bytes,reserveEnd-i);

        if (i+bytes > allocEnd) {
          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
        }
        return &data[i];
      }
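
      /* Note on the protocol above: when 'partial' is true the block may hand
       * out fewer bytes than requested; the caller learns the actual size
       * through the updated 'bytes_in' reference and retries the remaining
       * allocation elsewhere (see ThreadLocal::malloc). */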

      void* ptr() {
        return &data[cur];
      }

      void reset_block ()
      {
        allocEnd = max(allocEnd,(size_t)cur);
        cur = 0;
      }

      size_t getBlockUsedBytes() const {
        return min(size_t(cur),reserveEnd);
      }

      size_t getBlockFreeBytes() const {
        return getBlockAllocatedBytes() - getBlockUsedBytes();
      }

      size_t getBlockAllocatedBytes() const {
        return min(max(allocEnd,size_t(cur)),reserveEnd);
      }

      size_t getBlockWastedBytes() const {
        const size_t sizeof_Header = offsetof(Block,data[0]);
        return sizeof_Header + wasted;
      }

      size_t getBlockReservedBytes() const {
        return reserveEnd;
      }

      bool hasType(AllocationType atype_i, bool huge_pages_i) const
      {
        if      (atype_i == ANY_TYPE ) return true;
        else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
        else return atype_i == atype;
      }

      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockUsedBytes();
        }
        return bytes;
      }

      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockFreeBytes();
        }
        return bytes;
      }

      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockWastedBytes();
        }
        return bytes;
      }

      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
        size_t bytes = 0;
        for (const Block* block = this; block; block = block->next) {
          if (!block->hasType(atype,huge_pages)) continue;
          bytes += block->getBlockAllocatedBytes();
        }
        return bytes;
      }

      void print_list ()
      {
        for (const Block* block = this; block; block = block->next)
          block->print_block();
      }

      void print_block() const
      {
        if      (atype == ALIGNED_MALLOC) std::cout << "A";
        else if (atype == OS_MALLOC)      std::cout << "O";
        else if (atype == SHARED)         std::cout << "S";
        if (huge_pages) std::cout << "H";
        size_t bytesUsed = getBlockUsedBytes();
        size_t bytesFree = getBlockFreeBytes();
        size_t bytesWasted = getBlockWastedBytes();
        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
      }

    public:
      std::atomic<size_t> cur;        //!< current location of the allocator
      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
      Block* next;                    //!< pointer to next block in list
      size_t wasted;                  //!< amount of memory wasted through block alignment
      AllocationType atype;           //!< allocation mode of the block
      bool huge_pages;                //!< whether the block uses huge pages
      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
      char data[1];                   //!< here starts memory to use for allocations
    };

  private:
    Device* device;
    SpinLock mutex;
    size_t slotMask;
    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    std::atomic<Block*> usedBlocks;
    std::atomic<Block*> freeBlocks;

    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];

    bool use_single_mode;
    size_t defaultBlockSize;
    size_t estimatedSize;
    size_t growSize;
    size_t maxGrowSize;
    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
    std::atomic<size_t> bytesUsed;
    std::atomic<size_t> bytesFree;
    std::atomic<size_t> bytesWasted;
    static __thread ThreadLocal2* thread_local_allocator2;
    static SpinLock s_thread_local_allocators_lock;
    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
    SpinLock thread_local_allocators_lock;
    std::vector<ThreadLocal2*> thread_local_allocators;
    AllocationType atype;
    mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
  };
}