1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "../common/builder.h" |
7 | #include "../../common/algorithms/parallel_reduce.h" |
8 | |
9 | namespace embree |
10 | { |
11 | namespace isa |
12 | { |
13 | struct BVHBuilderMorton |
14 | { |
15 | static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor |
16 | static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth |
17 | |
18 | /*! settings for morton builder */ |
19 | struct Settings |
20 | { |
21 | /*! default settings */ |
22 | Settings () |
23 | : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {} |
24 | |
25 | /*! initialize settings from API settings */ |
26 | Settings (const RTCBuildArguments& settings) |
27 | : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) |
28 | { |
29 | if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; |
30 | if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; |
31 | if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; |
32 | if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; |
33 | |
34 | minLeafSize = min(a: minLeafSize,b: maxLeafSize); |
35 | } |
36 | |
37 | Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold) |
38 | : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold) |
39 | { |
40 | minLeafSize = min(a: minLeafSize,b: maxLeafSize); |
41 | } |
42 | |
43 | public: |
44 | size_t branchingFactor; //!< branching factor of BVH to build |
45 | size_t maxDepth; //!< maximum depth of BVH to build |
46 | size_t minLeafSize; //!< minimum size of a leaf |
47 | size_t maxLeafSize; //!< maximum size of a leaf |
48 | size_t singleThreadThreshold; //!< threshold when we switch to single threaded build |
49 | }; |
50 | |
51 | /*! Build primitive consisting of morton code and primitive ID. */ |
52 | struct __aligned(8) BuildPrim |
53 | { |
54 | union { |
55 | struct { |
56 | unsigned int code; //!< morton code |
57 | unsigned int index; //!< i'th primitive |
58 | }; |
59 | uint64_t t; |
60 | }; |
61 | |
62 | /*! interface for radix sort */ |
63 | __forceinline operator unsigned() const { return code; } |
64 | |
65 | /*! interface for standard sort */ |
66 | __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; } |
67 | }; |
68 | |
69 | /*! maps bounding box to morton code */ |
70 | struct MortonCodeMapping |
71 | { |
72 | static const size_t LATTICE_BITS_PER_DIM = 10; |
73 | static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM; |
74 | |
75 | vfloat4 base; |
76 | vfloat4 scale; |
77 | |
78 | __forceinline MortonCodeMapping(const BBox3fa& bounds) |
79 | { |
80 | base = (vfloat4)bounds.lower; |
81 | const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower; |
82 | scale = select(m: diag > vfloat4(1E-19f), t: rcp(a: diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),f: vfloat4(0.0f)); |
83 | } |
84 | |
85 | __forceinline const vint4 bin (const BBox3fa& box) const |
86 | { |
87 | const vfloat4 lower = (vfloat4)box.lower; |
88 | const vfloat4 upper = (vfloat4)box.upper; |
89 | const vfloat4 centroid = lower+upper; |
90 | return vint4((centroid-base)*scale); |
91 | } |
92 | |
93 | __forceinline unsigned int code (const BBox3fa& box) const |
94 | { |
95 | const vint4 binID = bin(box); |
96 | const unsigned int x = extract<0>(b: binID); |
97 | const unsigned int y = extract<1>(b: binID); |
98 | const unsigned int z = extract<2>(b: binID); |
99 | const unsigned int xyz = bitInterleave(xin: x,yin: y,zin: z); |
100 | return xyz; |
101 | } |
102 | }; |
103 | |
104 | #if defined (__AVX2__) |
105 | |
106 | /*! for AVX2 there is a fast scalar bitInterleave */ |
107 | struct MortonCodeGenerator |
108 | { |
109 | __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) |
110 | : mapping(mapping), dest(dest) {} |
111 | |
112 | __forceinline void operator() (const BBox3fa& b, const unsigned index) |
113 | { |
114 | dest->index = index; |
115 | dest->code = mapping.code(b); |
116 | dest++; |
117 | } |
118 | |
119 | public: |
120 | const MortonCodeMapping mapping; |
121 | BuildPrim* dest; |
122 | size_t currentID; |
123 | }; |
124 | |
125 | #else |
126 | |
127 | /*! before AVX2 is it better to use the SSE version of bitInterleave */ |
128 | struct MortonCodeGenerator |
129 | { |
130 | __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) |
131 | : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {} |
132 | |
133 | __forceinline ~MortonCodeGenerator() |
134 | { |
135 | if (slots != 0) |
136 | { |
137 | const vint4 code = bitInterleave(xin: ax,yin: ay,zin: az); |
138 | for (size_t i=0; i<slots; i++) { |
139 | dest[currentID-slots+i].index = ai[i]; |
140 | dest[currentID-slots+i].code = code[i]; |
141 | } |
142 | } |
143 | } |
144 | |
145 | __forceinline void operator() (const BBox3fa& b, const unsigned index) |
146 | { |
147 | const vint4 binID = mapping.bin(box: b); |
148 | ax[slots] = extract<0>(b: binID); |
149 | ay[slots] = extract<1>(b: binID); |
150 | az[slots] = extract<2>(b: binID); |
151 | ai[slots] = index; |
152 | slots++; |
153 | currentID++; |
154 | |
155 | if (slots == 4) |
156 | { |
157 | const vint4 code = bitInterleave(xin: ax,yin: ay,zin: az); |
158 | vint4::storeu(ptr: &dest[currentID-4],v: unpacklo(a: code,b: ai)); |
159 | vint4::storeu(ptr: &dest[currentID-2],v: unpackhi(a: code,b: ai)); |
160 | slots = 0; |
161 | } |
162 | } |
163 | |
164 | public: |
165 | const MortonCodeMapping mapping; |
166 | BuildPrim* dest; |
167 | size_t currentID; |
168 | size_t slots; |
169 | vint4 ax, ay, az, ai; |
170 | }; |
171 | |
172 | #endif |
173 | |
174 | template< |
175 | typename ReductionTy, |
176 | typename Allocator, |
177 | typename CreateAllocator, |
178 | typename CreateNodeFunc, |
179 | typename SetNodeBoundsFunc, |
180 | typename CreateLeafFunc, |
181 | typename CalculateBounds, |
182 | typename ProgressMonitor> |
183 | |
184 | class BuilderT : private Settings |
185 | { |
186 | ALIGNED_CLASS_(16); |
187 | |
188 | public: |
189 | |
190 | BuilderT (CreateAllocator& createAllocator, |
191 | CreateNodeFunc& createNode, |
192 | SetNodeBoundsFunc& setBounds, |
193 | CreateLeafFunc& createLeaf, |
194 | CalculateBounds& calculateBounds, |
195 | ProgressMonitor& progressMonitor, |
196 | const Settings& settings) |
197 | |
198 | : Settings(settings), |
199 | createAllocator(createAllocator), |
200 | createNode(createNode), |
201 | setBounds(setBounds), |
202 | createLeaf(createLeaf), |
203 | calculateBounds(calculateBounds), |
204 | progressMonitor(progressMonitor), |
205 | morton(nullptr) {} |
206 | |
207 | ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc) |
208 | { |
209 | /* this should never occur but is a fatal error */ |
210 | if (depth > maxDepth) |
211 | throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached" ); |
212 | |
213 | /* create leaf for few primitives */ |
214 | if (current.size() <= maxLeafSize) |
215 | return createLeaf(current,alloc); |
216 | |
217 | /* fill all children by always splitting the largest one */ |
218 | range<unsigned> children[MAX_BRANCHING_FACTOR]; |
219 | size_t numChildren = 1; |
220 | children[0] = current; |
221 | |
222 | do { |
223 | |
224 | /* find best child with largest number of primitives */ |
225 | size_t bestChild = -1; |
226 | size_t bestSize = 0; |
227 | for (size_t i=0; i<numChildren; i++) |
228 | { |
229 | /* ignore leaves as they cannot get split */ |
230 | if (children[i].size() <= maxLeafSize) |
231 | continue; |
232 | |
233 | /* remember child with largest size */ |
234 | if (children[i].size() > bestSize) { |
235 | bestSize = children[i].size(); |
236 | bestChild = i; |
237 | } |
238 | } |
239 | if (bestChild == size_t(-1)) break; |
240 | |
241 | /*! split best child into left and right child */ |
242 | auto split = children[bestChild].split(); |
243 | |
244 | /* add new children left and right */ |
245 | children[bestChild] = children[numChildren-1]; |
246 | children[numChildren-1] = split.first; |
247 | children[numChildren+0] = split.second; |
248 | numChildren++; |
249 | |
250 | } while (numChildren < branchingFactor); |
251 | |
252 | /* create node */ |
253 | auto node = createNode(alloc,numChildren); |
254 | |
255 | /* recurse into each child */ |
256 | ReductionTy bounds[MAX_BRANCHING_FACTOR]; |
257 | for (size_t i=0; i<numChildren; i++) |
258 | bounds[i] = createLargeLeaf(depth: depth+1,current: children[i],alloc); |
259 | |
260 | return setBounds(node,bounds,numChildren); |
261 | } |
262 | |
263 | /*! recreates morton codes when reaching a region where all codes are identical */ |
264 | __noinline void recreateMortonCodes(const range<unsigned>& current) const |
265 | { |
266 | /* fast path for small ranges */ |
267 | if (likely(current.size() < 1024)) |
268 | { |
269 | /*! recalculate centroid bounds */ |
270 | BBox3fa centBounds(empty); |
271 | for (size_t i=current.begin(); i<current.end(); i++) |
272 | centBounds.extend(center2(calculateBounds(morton[i]))); |
273 | |
274 | /* recalculate morton codes */ |
275 | MortonCodeMapping mapping(centBounds); |
276 | for (size_t i=current.begin(); i<current.end(); i++) |
277 | morton[i].code = mapping.code(box: calculateBounds(morton[i])); |
278 | |
279 | /* sort morton codes */ |
280 | std::sort(first: morton+current.begin(),last: morton+current.end()); |
281 | } |
282 | else |
283 | { |
284 | /*! recalculate centroid bounds */ |
285 | auto calculateCentBounds = [&] ( const range<unsigned>& r ) { |
286 | BBox3fa centBounds = empty; |
287 | for (size_t i=r.begin(); i<r.end(); i++) |
288 | centBounds.extend(center2(calculateBounds(morton[i]))); |
289 | return centBounds; |
290 | }; |
291 | const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024), |
292 | BBox3fa(empty), calculateCentBounds, BBox3fa::merge); |
293 | |
294 | /* recalculate morton codes */ |
295 | MortonCodeMapping mapping(centBounds); |
296 | parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) { |
297 | for (size_t i=r.begin(); i<r.end(); i++) { |
298 | morton[i].code = mapping.code(box: calculateBounds(morton[i])); |
299 | } |
300 | }); |
301 | |
302 | /*! sort morton codes */ |
303 | #if defined(TASKING_TBB) |
304 | tbb::parallel_sort(morton+current.begin(),morton+current.end()); |
305 | #else |
306 | radixsort32(morton: morton+current.begin(),num: current.size()); |
307 | #endif |
308 | } |
309 | } |
310 | |
311 | __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const |
312 | { |
313 | const unsigned int code_start = morton[current.begin()].code; |
314 | const unsigned int code_end = morton[current.end()-1].code; |
315 | unsigned int bitpos = lzcnt(x: code_start^code_end); |
316 | |
317 | /* if all items mapped to same morton code, then re-create new morton codes for the items */ |
318 | if (unlikely(bitpos == 32)) |
319 | { |
320 | recreateMortonCodes(current); |
321 | const unsigned int code_start = morton[current.begin()].code; |
322 | const unsigned int code_end = morton[current.end()-1].code; |
323 | bitpos = lzcnt(x: code_start^code_end); |
324 | |
325 | /* if the morton code is still the same, goto fall back split */ |
326 | if (unlikely(bitpos == 32)) { |
327 | current.split(left_o&: left,right_o&: right); |
328 | return; |
329 | } |
330 | } |
331 | |
332 | /* split the items at the topmost different morton code bit */ |
333 | const unsigned int bitpos_diff = 31-bitpos; |
334 | const unsigned int bitmask = 1 << bitpos_diff; |
335 | |
336 | /* find location where bit differs using binary search */ |
337 | unsigned begin = current.begin(); |
338 | unsigned end = current.end(); |
339 | while (begin + 1 != end) { |
340 | const unsigned mid = (begin+end)/2; |
341 | const unsigned bit = morton[mid].code & bitmask; |
342 | if (bit == 0) begin = mid; else end = mid; |
343 | } |
344 | unsigned center = end; |
345 | #if defined(DEBUG) |
346 | for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0); |
347 | for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask); |
348 | #endif |
349 | |
350 | left = make_range(begin: current.begin(),end: center); |
351 | right = make_range(begin: center,end: current.end()); |
352 | } |
353 | |
354 | ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel) |
355 | { |
356 | /* get thread local allocator */ |
357 | if (!alloc) |
358 | alloc = createAllocator(); |
359 | |
360 | /* call memory monitor function to signal progress */ |
361 | if (toplevel && current.size() <= singleThreadThreshold) |
362 | progressMonitor(current.size()); |
363 | |
364 | /* create leaf node */ |
365 | if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize)) |
366 | return createLargeLeaf(depth,current,alloc); |
367 | |
368 | /* fill all children by always splitting the one with the largest surface area */ |
369 | range<unsigned> children[MAX_BRANCHING_FACTOR]; |
370 | split(current,left&: children[0],right&: children[1]); |
371 | size_t numChildren = 2; |
372 | |
373 | while (numChildren < branchingFactor) |
374 | { |
375 | /* find best child with largest number of primitives */ |
376 | int bestChild = -1; |
377 | unsigned bestItems = 0; |
378 | for (unsigned int i=0; i<numChildren; i++) |
379 | { |
380 | /* ignore leaves as they cannot get split */ |
381 | if (children[i].size() <= minLeafSize) |
382 | continue; |
383 | |
384 | /* remember child with largest area */ |
385 | if (children[i].size() > bestItems) { |
386 | bestItems = children[i].size(); |
387 | bestChild = i; |
388 | } |
389 | } |
390 | if (bestChild == -1) break; |
391 | |
392 | /*! split best child into left and right child */ |
393 | range<unsigned> left, right; |
394 | split(current: children[bestChild],left,right); |
395 | |
396 | /* add new children left and right */ |
397 | children[bestChild] = children[numChildren-1]; |
398 | children[numChildren-1] = left; |
399 | children[numChildren+0] = right; |
400 | numChildren++; |
401 | } |
402 | |
403 | /* create leaf node if no split is possible */ |
404 | if (unlikely(numChildren == 1)) |
405 | return createLeaf(current,alloc); |
406 | |
407 | /* allocate node */ |
408 | auto node = createNode(alloc,numChildren); |
409 | |
410 | /* process top parts of tree parallel */ |
411 | ReductionTy bounds[MAX_BRANCHING_FACTOR]; |
412 | if (current.size() > singleThreadThreshold) |
413 | { |
414 | /*! parallel_for is faster than spawing sub-tasks */ |
415 | parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { |
416 | for (size_t i=r.begin(); i<r.end(); i++) { |
417 | bounds[i] = recurse(depth: depth+1,current: children[i],alloc: nullptr,toplevel: true); |
418 | _mm_mfence(); // to allow non-temporal stores during build |
419 | } |
420 | }); |
421 | } |
422 | |
423 | /* finish tree sequentially */ |
424 | else |
425 | { |
426 | for (size_t i=0; i<numChildren; i++) |
427 | bounds[i] = recurse(depth: depth+1,current: children[i],alloc,toplevel: false); |
428 | } |
429 | |
430 | return setBounds(node,bounds,numChildren); |
431 | } |
432 | |
433 | /* build function */ |
434 | ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives) |
435 | { |
436 | /* sort morton codes */ |
437 | morton = src; |
438 | radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold); |
439 | |
440 | /* build BVH */ |
441 | const ReductionTy root = recurse(depth: 1, current: range<unsigned>(0,(unsigned)numPrimitives), alloc: nullptr, toplevel: true); |
442 | _mm_mfence(); // to allow non-temporal stores during build |
443 | return root; |
444 | } |
445 | |
446 | public: |
447 | CreateAllocator& createAllocator; |
448 | CreateNodeFunc& createNode; |
449 | SetNodeBoundsFunc& setBounds; |
450 | CreateLeafFunc& createLeaf; |
451 | CalculateBounds& calculateBounds; |
452 | ProgressMonitor& progressMonitor; |
453 | |
454 | public: |
455 | BuildPrim* morton; |
456 | }; |
457 | |
458 | |
459 | template< |
460 | typename ReductionTy, |
461 | typename CreateAllocFunc, |
462 | typename CreateNodeFunc, |
463 | typename SetBoundsFunc, |
464 | typename CreateLeafFunc, |
465 | typename CalculateBoundsFunc, |
466 | typename ProgressMonitor> |
467 | |
468 | static ReductionTy build(CreateAllocFunc createAllocator, |
469 | CreateNodeFunc createNode, |
470 | SetBoundsFunc setBounds, |
471 | CreateLeafFunc createLeaf, |
472 | CalculateBoundsFunc calculateBounds, |
473 | ProgressMonitor progressMonitor, |
474 | BuildPrim* src, |
475 | BuildPrim* tmp, |
476 | size_t numPrimitives, |
477 | const Settings& settings) |
478 | { |
479 | typedef BuilderT< |
480 | ReductionTy, |
481 | decltype(createAllocator()), |
482 | CreateAllocFunc, |
483 | CreateNodeFunc, |
484 | SetBoundsFunc, |
485 | CreateLeafFunc, |
486 | CalculateBoundsFunc, |
487 | ProgressMonitor> Builder; |
488 | |
489 | Builder builder(createAllocator, |
490 | createNode, |
491 | setBounds, |
492 | createLeaf, |
493 | calculateBounds, |
494 | progressMonitor, |
495 | settings); |
496 | |
497 | return builder.build(src,tmp,numPrimitives); |
498 | } |
499 | }; |
500 | } |
501 | } |
502 | |