bvh_builder_morton.h source code [qtquick3d/src/3rdparty/embree/kernels/builders/bvh_builder_morton.h]

1	// Copyright 2009-2021 Intel Corporation
2	// SPDX-License-Identifier: Apache-2.0
3
4	#pragma once
5
6	#include "../common/builder.h"
7	#include "../../common/algorithms/parallel_reduce.h"
8
9	namespace embree
10	{
11	namespace isa
12	{
13	struct BVHBuilderMorton
14	{
15	static const size_t MAX_BRANCHING_FACTOR = `8`; //!< maximum supported BVH branching factor
16	static const size_t MIN_LARGE_LEAF_LEVELS = `8`; //!< create balanced tree of we are that many levels before the maximum tree depth
17
18	/! settings for morton builder /
19	struct Settings
20	{
21	/! default settings /
22	Settings ()
23	: branchingFactor(`2`), maxDepth(`32`), minLeafSize(`1`), maxLeafSize(`7`), singleThreadThreshold(`1024`) {}
24
25	/! initialize settings from API settings /
26	Settings (const RTCBuildArguments& settings)
27	: branchingFactor(`2`), maxDepth(`32`), minLeafSize(`1`), maxLeafSize(`7`), singleThreadThreshold(`1024`)
28	{
29	if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
30	if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
31	if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
32	if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
33
34	minLeafSize = min(a: minLeafSize,b: maxLeafSize);
35	}
36
37	Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
38	: branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
39	{
40	minLeafSize = min(a: minLeafSize,b: maxLeafSize);
41	}
42
43	public:
44	size_t branchingFactor; //!< branching factor of BVH to build
45	size_t maxDepth; //!< maximum depth of BVH to build
46	size_t minLeafSize; //!< minimum size of a leaf
47	size_t maxLeafSize; //!< maximum size of a leaf
48	size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
49	};
50
51	/! Build primitive consisting of morton code and primitive ID. /
52	struct __aligned(`8`) BuildPrim
53	{
54	union {
55	struct {
56	unsigned int code; //!< morton code
57	unsigned int index; //!< i'th primitive
58	};
59	uint64_t t;
60	};
61
62	/! interface for radix sort /
63	__forceinline operator unsigned() const { return code; }
64
65	/! interface for standard sort /
66	__forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
67	};
68
69	/! maps bounding box to morton code /
70	struct MortonCodeMapping
71	{
72	static const size_t LATTICE_BITS_PER_DIM = `10`;
73	static const size_t LATTICE_SIZE_PER_DIM = size_t(`1`) << LATTICE_BITS_PER_DIM;
74
75	vfloat4 base;
76	vfloat4 scale;
77
78	__forceinline MortonCodeMapping(const BBox3fa& bounds)
79	{
80	base = (vfloat4)bounds.lower;
81	const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
82	scale = select(m: diag > vfloat4 (`1E-19f`), t: rcp(a: diag) * vfloat4 (LATTICE_SIZE_PER_DIM * `0.99f`),f: vfloat4 (`0.0f`));
83	}
84
85	__forceinline const vint4 bin (const BBox3fa& box) const
86	{
87	const vfloat4 lower = (vfloat4)box.lower;
88	const vfloat4 upper = (vfloat4)box.upper;
89	const vfloat4 centroid = lower +upper;
90	return vint4 ((centroid -base)*scale);
91	}
92
93	__forceinline unsigned int code (const BBox3fa& box) const
94	{
95	const vint4 binID = bin(box);
96	const unsigned int x = extract<`0`>(b: binID);
97	const unsigned int y = extract<`1`>(b: binID);
98	const unsigned int z = extract<`2`>(b: binID);
99	const unsigned int xyz = bitInterleave(xin: x,yin: y,zin: z);
100	return xyz;
101	}
102	};
103
104	#if defined (__AVX2__)
105
106	/! for AVX2 there is a fast scalar bitInterleave /
107	struct MortonCodeGenerator
108	{
109	__forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
110	: mapping(mapping), dest(dest) {}
111
112	__forceinline void operator() (const BBox3fa& b, const unsigned index)
113	{
114	dest->index = index;
115	dest->code = mapping.code(b);
116	dest++;
117	}
118
119	public:
120	const MortonCodeMapping mapping;
121	BuildPrim* dest;
122	size_t currentID;
123	};
124
125	#else
126
127	/! before AVX2 is it better to use the SSE version of bitInterleave /
128	struct MortonCodeGenerator
129	{
130	__forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
131	: mapping (mapping), dest(dest), currentID(`0`), slots(`0`), ax (`0`), ay (`0`), az (`0`), ai (`0`) {}
132
133	__forceinline ~MortonCodeGenerator()
134	{
135	if (slots != `0`)
136	{
137	const vint4 code = bitInterleave(xin: ax,yin: ay,zin: az);
138	for (size_t i=`0`; i<slots; i++) {
139	dest[currentID-slots+i].index = ai [i];
140	dest[currentID-slots+i].code = code [i];
141	}
142	}
143	}
144
145	__forceinline void operator() (const BBox3fa& b, const unsigned index)
146	{
147	const vint4 binID = mapping.bin(box: b);
148	ax [slots] = extract<`0`>(b: binID);
149	ay [slots] = extract<`1`>(b: binID);
150	az [slots] = extract<`2`>(b: binID);
151	ai [slots] = index;
152	slots++;
153	currentID++;
154
155	if (slots == `4`)
156	{
157	const vint4 code = bitInterleave(xin: ax,yin: ay,zin: az);
158	vint4::storeu(ptr: &dest[currentID-`4`],v: unpacklo(a: code,b: ai));
159	vint4::storeu(ptr: &dest[currentID-`2`],v: unpackhi(a: code,b: ai));
160	slots = `0`;
161	}
162	}
163
164	public:
165	const MortonCodeMapping mapping;
166	BuildPrim* dest;
167	size_t currentID;
168	size_t slots;
169	vint4 ax, ay, az, ai;
170	};
171
172	#endif
173
174	template<
175	typename ReductionTy,
176	typename Allocator,
177	typename CreateAllocator,
178	typename CreateNodeFunc,
179	typename SetNodeBoundsFunc,
180	typename CreateLeafFunc,
181	typename CalculateBounds,
182	typename ProgressMonitor>
183
184	class BuilderT : private Settings
185	{
186	ALIGNED_CLASS_(`16`);
187
188	public:
189
190	BuilderT (CreateAllocator& createAllocator,
191	CreateNodeFunc& createNode,
192	SetNodeBoundsFunc& setBounds,
193	CreateLeafFunc& createLeaf,
194	CalculateBounds& calculateBounds,
195	ProgressMonitor& progressMonitor,
196	const Settings& settings)
197
198	: Settings(settings),
199	createAllocator(createAllocator),
200	createNode(createNode),
201	setBounds(setBounds),
202	createLeaf(createLeaf),
203	calculateBounds(calculateBounds),
204	progressMonitor(progressMonitor),
205	morton(nullptr) {}
206
207	ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
208	{
209	/ this should never occur but is a fatal error /
210	if (depth > maxDepth)
211	throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
212
213	/ create leaf for few primitives /
214	if (current.size() <= maxLeafSize)
215	return createLeaf(current,alloc);
216
217	/ fill all children by always splitting the largest one /
218	range<unsigned> children[MAX_BRANCHING_FACTOR];
219	size_t numChildren = `1`;
220	children[`0`] = current;
221
222	do {
223
224	/ find best child with largest number of primitives /
225	size_t bestChild = -`1`;
226	size_t bestSize = `0`;
227	for (size_t i=`0`; i<numChildren; i++)
228	{
229	/ ignore leaves as they cannot get split /
230	if (children[i].size() <= maxLeafSize)
231	continue;
232
233	/ remember child with largest size /
234	if (children[i].size() > bestSize) {
235	bestSize = children[i].size();
236	bestChild = i;
237	}
238	}
239	if (bestChild == size_t(-`1`)) break;
240
241	/! split best child into left and right child /
242	auto split = children[bestChild].split();
243
244	/ add new children left and right /
245	children[bestChild] = children[numChildren-`1`];
246	children[numChildren-`1`] = split.first;
247	children[numChildren+`0`] = split.second;
248	numChildren++;
249
250	} while (numChildren < branchingFactor);
251
252	/ create node /
253	auto node = createNode(alloc,numChildren);
254
255	/ recurse into each child /
256	ReductionTy bounds[MAX_BRANCHING_FACTOR];
257	for (size_t i=`0`; i<numChildren; i++)
258	bounds[i] = createLargeLeaf(depth: depth+`1`,current: children[i],alloc);
259
260	return setBounds(node,bounds,numChildren);
261	}
262
263	/! recreates morton codes when reaching a region where all codes are identical /
264	__noinline void recreateMortonCodes(const range<unsigned>& current) const
265	{
266	/ fast path for small ranges /
267	if (likely(current.size() < `1024`))
268	{
269	/! recalculate centroid bounds /
270	BBox3fa centBounds(empty);
271	for (size_t i=current.begin(); i<current.end(); i++)
272	centBounds.extend(center2(calculateBounds(morton[i])));
273
274	/ recalculate morton codes /
275	MortonCodeMapping mapping(centBounds);
276	for (size_t i=current.begin(); i<current.end(); i++)
277	morton[i].code = mapping.code(box: calculateBounds(morton[i]));
278
279	/ sort morton codes /
280	std::sort(first: morton+current.begin(),last: morton+current.end());
281	}
282	else
283	{
284	/! recalculate centroid bounds /
285	auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
286	BBox3fa centBounds = empty;
287	for (size_t i=r.begin(); i<r.end(); i++)
288	centBounds.extend(center2(calculateBounds(morton[i])));
289	return centBounds;
290	};
291	const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(`1024`),
292	BBox3fa (empty), calculateCentBounds, BBox3fa::merge);
293
294	/ recalculate morton codes /
295	MortonCodeMapping mapping(centBounds);
296	parallel_for(current.begin(), current.end(), unsigned(`1024`), [&] ( const range<unsigned>& r ) {
297	for (size_t i=r.begin(); i<r.end(); i++) {
298	morton[i].code = mapping.code(box: calculateBounds(morton[i]));
299	}
300	});
301
302	/! sort morton codes /
303	#if defined(TASKING_TBB)
304	tbb::parallel_sort(morton+current.begin(),morton+current.end());
305	#else
306	radixsort32(morton: morton+current.begin(),num: current.size());
307	#endif
308	}
309	}
310
311	__forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
312	{
313	const unsigned int code_start = morton[current.begin()].code;
314	const unsigned int code_end = morton[current.end()-`1`].code;
315	unsigned int bitpos = lzcnt(x: code_start^code_end);
316
317	/ if all items mapped to same morton code, then re-create new morton codes for the items /
318	if (unlikely(bitpos == `32`))
319	{
320	recreateMortonCodes(current);
321	const unsigned int code_start = morton[current.begin()].code;
322	const unsigned int code_end = morton[current.end()-`1`].code;
323	bitpos = lzcnt(x: code_start^code_end);
324
325	/ if the morton code is still the same, goto fall back split /
326	if (unlikely(bitpos == `32`)) {
327	current.split(left_o&: left,right_o&: right);
328	return;
329	}
330	}
331
332	/ split the items at the topmost different morton code bit /
333	const unsigned int bitpos_diff = `31`-bitpos;
334	const unsigned int bitmask = `1` << bitpos_diff;
335
336	/ find location where bit differs using binary search /
337	unsigned begin = current.begin();
338	unsigned end = current.end();
339	while (begin + `1` != end) {
340	const unsigned mid = (begin+end)/`2`;
341	const unsigned bit = morton[mid].code & bitmask;
342	if (bit == `0`) begin = mid; else end = mid;
343	}
344	unsigned center = end;
345	#if defined(DEBUG)
346	for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == `0`);
347	for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask);
348	#endif
349
350	left = make_range(begin: current.begin(),end: center);
351	right = make_range(begin: center,end: current.end());
352	}
353
354	ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
355	{
356	/ get thread local allocator /
357	if (!alloc)
358	alloc = createAllocator();
359
360	/ call memory monitor function to signal progress /
361	if (toplevel && current.size() <= singleThreadThreshold)
362	progressMonitor(current.size());
363
364	/ create leaf node /
365	if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth \|\| current.size() <= minLeafSize))
366	return createLargeLeaf(depth,current,alloc);
367
368	/ fill all children by always splitting the one with the largest surface area /
369	range<unsigned> children[MAX_BRANCHING_FACTOR];
370	split(current,left&: children[`0`],right&: children[`1`]);
371	size_t numChildren = `2`;
372
373	while (numChildren < branchingFactor)
374	{
375	/ find best child with largest number of primitives /
376	int bestChild = -`1`;
377	unsigned bestItems = `0`;
378	for (unsigned int i=`0`; i<numChildren; i++)
379	{
380	/ ignore leaves as they cannot get split /
381	if (children[i].size() <= minLeafSize)
382	continue;
383
384	/ remember child with largest area /
385	if (children[i].size() > bestItems) {
386	bestItems = children[i].size();
387	bestChild = i;
388	}
389	}
390	if (bestChild == -`1`) break;
391
392	/! split best child into left and right child /
393	range<unsigned> left, right;
394	split(current: children[bestChild],left,right);
395
396	/ add new children left and right /
397	children[bestChild] = children[numChildren-`1`];
398	children[numChildren-`1`] = left;
399	children[numChildren+`0`] = right;
400	numChildren++;
401	}
402
403	/ create leaf node if no split is possible /
404	if (unlikely(numChildren == `1`))
405	return createLeaf(current,alloc);
406
407	/ allocate node /
408	auto node = createNode(alloc,numChildren);
409
410	/ process top parts of tree parallel /
411	ReductionTy bounds[MAX_BRANCHING_FACTOR];
412	if (current.size() > singleThreadThreshold)
413	{
414	/! parallel_for is faster than spawing sub-tasks /
415	parallel_for(size_t(`0`), numChildren, [&] (const range<size_t>& r) {
416	for (size_t i=r.begin(); i<r.end(); i++) {
417	bounds[i] = recurse(depth: depth+`1`,current: children[i],alloc: nullptr,toplevel: true);
418	_mm_mfence(); // to allow non-temporal stores during build
419	}
420	});
421	}
422
423	/ finish tree sequentially /
424	else
425	{
426	for (size_t i=`0`; i<numChildren; i++)
427	bounds[i] = recurse(depth: depth+`1`,current: children[i],alloc,toplevel: false);
428	}
429
430	return setBounds(node,bounds,numChildren);
431	}
432
433	/ build function /
434	ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
435	{
436	/ sort morton codes /
437	morton = src;
438	radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
439
440	/ build BVH /
441	const ReductionTy root = recurse(depth: `1`, current: range<unsigned>(`0`,(unsigned)numPrimitives), alloc: nullptr, toplevel: true);
442	_mm_mfence(); // to allow non-temporal stores during build
443	return root;
444	}
445
446	public:
447	CreateAllocator& createAllocator;
448	CreateNodeFunc& createNode;
449	SetNodeBoundsFunc& setBounds;
450	CreateLeafFunc& createLeaf;
451	CalculateBounds& calculateBounds;
452	ProgressMonitor& progressMonitor;
453
454	public:
455	BuildPrim* morton;
456	};
457
458
459	template<
460	typename ReductionTy,
461	typename CreateAllocFunc,
462	typename CreateNodeFunc,
463	typename SetBoundsFunc,
464	typename CreateLeafFunc,
465	typename CalculateBoundsFunc,
466	typename ProgressMonitor>
467
468	static ReductionTy build(CreateAllocFunc createAllocator,
469	CreateNodeFunc createNode,
470	SetBoundsFunc setBounds,
471	CreateLeafFunc createLeaf,
472	CalculateBoundsFunc calculateBounds,
473	ProgressMonitor progressMonitor,
474	BuildPrim* src,
475	BuildPrim* tmp,
476	size_t numPrimitives,
477	const Settings& settings)
478	{
479	typedef BuilderT<
480	ReductionTy,
481	decltype(createAllocator()),
482	CreateAllocFunc,
483	CreateNodeFunc,
484	SetBoundsFunc,
485	CreateLeafFunc,
486	CalculateBoundsFunc,
487	ProgressMonitor> Builder;
488
489	Builder builder(createAllocator,
490	createNode,
491	setBounds,
492	createLeaf,
493	calculateBounds,
494	progressMonitor,
495	settings);
496
497	return builder.build(src,tmp,numPrimitives);
498	}
499	};
500	}
501	}
502

source code of qtquick3d/src/3rdparty/embree/kernels/builders/bvh_builder_morton.h