// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

#define THREAD_BLOCK_ATOMIC_ADD 4

/* collect cache statistics in debug builds only */
#if defined(DEBUG)
#define CACHE_STATS(x) x
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t        cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////

 struct __aligned(64) ThreadWorkState
 {
   ALIGNED_STRUCT_(64);

   std::atomic<size_t> counter;
   ThreadWorkState* next;
   bool allocated;

   __forceinline ThreadWorkState(bool allocated = false)
     : counter(0), next(nullptr), allocated(allocated)
   {
     assert( ((size_t)this % 64) == 0 );
   }
 };
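
 /* The per-thread counter above serves as a lock count: lockThread() increments it
    by one for each cache access, and a value of THREAD_BLOCK_ATOMIC_ADD or larger
    signals that a synchronization (cache flush) phase is in progress, causing
    lockThreadLoop() below to back off until the counter drops back to zero. */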

 class __aligned(64) SharedLazyTessellationCache
 {
 public:

   static const size_t NUM_CACHE_SEGMENTS              = 8;
   static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
   static const size_t COMMIT_INDEX_SHIFT              = 32+8;
#if defined(__64BIT__)
   static const size_t REF_TAG_MASK                    = 0xffffffffff;
#else
   static const size_t REF_TAG_MASK                    = 0x7FFFFFFF;
#endif
   static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;
   static const size_t BLOCK_SIZE                      = 64;


    /*! Per thread tessellation ref cache */
   static __thread ThreadWorkState* init_t_state;
   static ThreadWorkState* current_t_state;

   static __forceinline ThreadWorkState *threadState()
   {
     if (unlikely(!init_t_state))
       /* sets init_t_state, can't return pointer due to macosx icc bug */
       SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
     return init_t_state;
   }

   struct Tag
   {
     __forceinline Tag() : data(0) {}

     __forceinline Tag(void* ptr, size_t combinedTime) {
       init(ptr,combinedTime);
     }

     __forceinline Tag(size_t ptr, size_t combinedTime) {
       init((void*)ptr,combinedTime);
     }

     __forceinline void init(void* ptr, size_t combinedTime)
     {
       if (ptr == nullptr) {
         data = 0;
         return;
       }
       int64_t new_root_ref = (int64_t) ptr;
       new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
       assert( new_root_ref <= (int64_t)REF_TAG_MASK );
       new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
       data = new_root_ref;
     }

     __forceinline int64_t get() const { return data.load(); }
     __forceinline void set( int64_t v ) { data.store(v); }
     __forceinline void reset() { data.store(0); }

   private:
     atomic<int64_t> data;
   };
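
   /* Tag encoding, illustrated: the low 40 bits (REF_TAG_MASK on 64-bit builds,
      COMMIT_INDEX_SHIFT = 40) store the byte offset of the cached data relative
      to the cache base pointer, and the upper bits store the combined time stamp.
      For example, a block at offset 0x1000 committed at combined time 5 yields
      data = ((int64_t)5 << 40) | 0x1000, and extractCommitIndex() below recovers
      the time stamp by shifting right by COMMIT_INDEX_SHIFT. */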

   static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

   struct CacheEntry
   {
     Tag tag;
     SpinLock mutex;
   };

 private:

   float *data;
   bool hugepages;
   size_t size;
   size_t maxBlocks;
   ThreadWorkState *threadWorkState;

   __aligned(64) std::atomic<size_t> localTime;
   __aligned(64) std::atomic<size_t> next_block;
   __aligned(64) SpinLock   reset_state;
   __aligned(64) SpinLock   linkedlist_mtx;
   __aligned(64) std::atomic<size_t> switch_block_threshold;
   __aligned(64) std::atomic<size_t> numRenderThreads;


 public:

   SharedLazyTessellationCache();
   ~SharedLazyTessellationCache();

   void getNextRenderThreadWorkState();

   __forceinline size_t maxAllocSize() const {
     return switch_block_threshold;
   }

   __forceinline size_t getCurrentIndex() { return localTime.load(); }
   __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

   __forceinline size_t getTime(const size_t globalTime) {
     return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
   }


   __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1)  { return t_state->counter.fetch_add(plus); }
   __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

   __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

   static __forceinline void lock  ()  { sharedLazyTessellationCache.lockThread(threadState()); }
   static __forceinline void unlock()  { sharedLazyTessellationCache.unlockThread(threadState()); }
   static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
   static __forceinline size_t getState() { return threadState()->counter.load(); }
   static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

   static __forceinline size_t getTCacheTime(const size_t globalTime) {
     return sharedLazyTessellationCache.getTime(globalTime);
   }

   /* per thread lock */
   __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
   {
     while(1)
     {
       size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
       if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
       {
         /* lock failed, wait until the sync phase is over */
         sharedLazyTessellationCache.unlockThread(t_state,-1);
         sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
       }
       else
         break;
     }
   }

   static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
   {
     const int64_t subdiv_patch_root_ref = entry.tag.get();
     CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

     if (likely(subdiv_patch_root_ref != 0))
     {
       const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
       const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

       if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
       {
         CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
         return (void*) subdiv_patch_root;
       }
     }
     CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
     return nullptr;
   }

   template<typename Constructor>
     static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
   {
     ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

     while (true)
     {
       sharedLazyTessellationCache.lockThreadLoop(t_state);
       void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
       if (patch) return (decltype(constructor())) patch;

       if (entry.mutex.try_lock())
       {
         if (!validTag(entry.tag,globalTime))
         {
           auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
           auto ret = constructor(); // thread is locked here!
           /* the constructor should never return nullptr */
           assert(ret);
           auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
           auto time = before ? timeBefore : timeAfter;
           __memory_barrier();
           entry.tag = SharedLazyTessellationCache::Tag(ret,time);
           __memory_barrier();
           entry.mutex.unlock();
           return ret;
         }
         entry.mutex.unlock();
       }
       SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
     }
   }
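
   /* Illustrative use of the templated lookup() above (hypothetical caller code,
      shown only as a sketch): the constructor is invoked while the calling thread
      holds its cache lock, and the thread remains locked when lookup() returns,
      so the caller releases the lock once it is done with the returned data:

        SharedLazyTessellationCache::CacheEntry entry;  // e.g. stored per patch
        float* grid = SharedLazyTessellationCache::lookup(entry,globalTime,[&]() {
          float* ptr = (float*) SharedLazyTessellationCache::malloc(numBytes);
          // ... fill ptr with tessellated data ...
          return ptr;
        });
        // ... use grid ...
        SharedLazyTessellationCache::unlock();
   */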

   __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
   {
#if FORCE_SIMPLE_FLUSH == 1
     return i == getTime(globalTime);
#else
     return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
   }

   static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
   {
     return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
   }
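
   /* Worked example of the aging scheme: with NUM_CACHE_SEGMENTS == 8, an entry
      stamped with time t stays valid as long as t+7 >= current time, i.e. it
      survives up to seven segment switches before it has to be rebuilt (with
      FORCE_SIMPLE_FLUSH enabled only an exact time match is accepted). */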


   static __forceinline bool validTag(const Tag& tag, size_t globalTime)
   {
     const int64_t subdiv_patch_root_ref = tag.get();
     if (subdiv_patch_root_ref == 0) return false;
     const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
     return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
   }

   void waitForUsersLessEqual(ThreadWorkState *const t_state,
                              const unsigned int users);

   __forceinline size_t alloc(const size_t blocks)
   {
     if (unlikely(blocks >= switch_block_threshold))
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

     assert(blocks < switch_block_threshold);
     size_t index = next_block.fetch_add(blocks);
     if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
     return index;
   }

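   /* malloc() below rounds the request up to whole cache blocks, e.g. a 100 byte
      request with BLOCK_SIZE == 64 becomes (100+63)/64 = 2 blocks; if the current
      segment is exhausted the thread temporarily drops its cache lock, switches to
      the next segment via allocNextSegment(), and retries. */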
   static __forceinline void* malloc(const size_t bytes)
   {
     size_t block_index = -1;
     ThreadWorkState *const t_state = threadState();
     while (true)
     {
       block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
       if (block_index == (size_t)-1)
       {
         sharedLazyTessellationCache.unlockThread(t_state);
         sharedLazyTessellationCache.allocNextSegment();
         sharedLazyTessellationCache.lockThread(t_state);
         continue;
       }
       break;
     }
     return sharedLazyTessellationCache.getBlockPtr(block_index);
   }

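   /* getBlockPtr() translates a block index into an address inside the cache:
      data is a float array, so block_index*16 floats correspond to
      block_index*64 bytes, matching BLOCK_SIZE == 64. */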
   __forceinline void *getBlockPtr(const size_t block_index)
   {
     assert(block_index < maxBlocks);
     assert(data);
     assert(block_index*16 <= size);
     return (void*)&data[block_index*16];
   }

   __forceinline void*  getDataPtr()      { return data; }
   __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
   __forceinline size_t getMaxBlocks()    { return maxBlocks; }
   __forceinline size_t getSize()         { return size; }

   void allocNextSegment();
   void realloc(const size_t newSize);

   void reset();

   static SharedLazyTessellationCache sharedLazyTessellationCache;
 };
}