1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "../builders/primrefgen.h" |
7 | #include "../builders/heuristic_spatial.h" |
8 | #include "../builders/splitter.h" |
9 | |
10 | #include "../../common/algorithms/parallel_for_for.h" |
11 | #include "../../common/algorithms/parallel_for_for_prefix_sum.h" |
12 | |
13 | #define DBG_PRESPLIT(x) |
14 | #define CHECK_PRESPLIT(x) |
15 | |
16 | #define GRID_SIZE 1024 |
17 | #define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5 |
18 | #define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG) |
19 | #define PRIORITY_CUTOFF_THRESHOLD 1.0f |
20 | #define PRIORITY_SPLIT_POS_WEIGHT 1.5f |
21 | |
22 | namespace embree |
23 | { |
24 | namespace isa |
25 | { |
26 | |
27 | struct PresplitItem |
28 | { |
29 | union { |
30 | float priority; |
31 | unsigned int data; |
32 | }; |
33 | unsigned int index; |
34 | |
35 | __forceinline operator unsigned() const |
36 | { |
37 | return reinterpret_cast<const unsigned&>(priority); |
38 | } |
39 | __forceinline bool operator < (const PresplitItem& item) const |
40 | { |
41 | return (priority < item.priority); |
42 | } |
43 | |
44 | template<typename Mesh> |
45 | __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc) |
46 | { |
47 | const unsigned int geomID = ref.geomID(); |
48 | const unsigned int primID = ref.primID(); |
49 | const float area_aabb = area(b: ref.bounds()); |
50 | const float area_prim = ((Mesh*)scene->get(i: geomID))->projectedPrimitiveArea(primID); |
51 | const unsigned int diff = 31 - lzcnt(x: mc.x^mc.y); |
52 | assert(area_prim <= area_aabb); |
53 | //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f); |
54 | const float priority = sqrtf(x: sqrtf( x: (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,y: (float)diff) )); |
55 | assert(priority >= 0.0f && priority < FLT_LARGE); |
56 | return priority; |
57 | } |
58 | |
59 | |
60 | }; |
61 | |
62 | inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { |
63 | return cout << "index " << item.index << " priority " << item.priority; |
64 | }; |
65 | |
66 | template<typename SplitterFactory> |
67 | void splitPrimitive(SplitterFactory &Splitter, |
68 | const PrimRef &prim, |
69 | const unsigned int geomID, |
70 | const unsigned int primID, |
71 | const unsigned int split_level, |
72 | const Vec3fa &grid_base, |
73 | const float grid_scale, |
74 | const float grid_extend, |
75 | PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE], |
76 | unsigned int& numSubPrims) |
77 | { |
78 | assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); |
79 | if (split_level == 0) |
80 | { |
81 | assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); |
82 | subPrims[numSubPrims++] = prim; |
83 | } |
84 | else |
85 | { |
86 | const Vec3fa lower = prim.lower; |
87 | const Vec3fa upper = prim.upper; |
88 | const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); |
89 | const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); |
90 | Vec3ia ilower(floor(a: glower)); |
91 | Vec3ia iupper(floor(a: gupper)); |
92 | |
93 | /* this ignores dimensions that are empty */ |
94 | iupper = (Vec3ia)(select(m: vint4(glower) >= vint4(gupper),t: vint4(ilower),f: vint4(iupper))); |
95 | |
96 | /* compute a morton code for the lower and upper grid coordinates. */ |
97 | const unsigned int lower_code = bitInterleave(xin: ilower.x,yin: ilower.y,zin: ilower.z); |
98 | const unsigned int upper_code = bitInterleave(xin: iupper.x,yin: iupper.y,zin: iupper.z); |
99 | |
100 | /* if all bits are equal then we cannot split */ |
101 | if(unlikely(lower_code == upper_code)) |
102 | { |
103 | assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); |
104 | subPrims[numSubPrims++] = prim; |
105 | return; |
106 | } |
107 | |
108 | /* compute octree level and dimension to perform the split in */ |
109 | const unsigned int diff = 31 - lzcnt(x: lower_code^upper_code); |
110 | const unsigned int level = diff / 3; |
111 | const unsigned int dim = diff % 3; |
112 | |
113 | /* now we compute the grid position of the split */ |
114 | const unsigned int isplit = iupper[dim] & ~((1<<level)-1); |
115 | |
116 | /* compute world space position of split */ |
117 | const float inv_grid_size = 1.0f / GRID_SIZE; |
118 | const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend; |
119 | |
120 | assert(prim.lower[dim] <= fsplit && |
121 | prim.upper[dim] >= fsplit); |
122 | |
123 | /* split primitive */ |
124 | const auto splitter = Splitter(prim); |
125 | BBox3fa left,right; |
126 | splitter(prim.bounds(),dim,fsplit,left,right); |
127 | assert(!left.empty()); |
128 | assert(!right.empty()); |
129 | |
130 | |
131 | splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); |
132 | splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); |
133 | } |
134 | } |
135 | |
136 | |
137 | template<typename Mesh, typename SplitterFactory> |
138 | PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) |
139 | { |
140 | ParallelPrefixSumState<PrimInfo> pstate; |
141 | |
142 | /* first try */ |
143 | progressMonitor(0); |
144 | PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { |
145 | return geometry->createPrimRefArray(prims,r,k: r.begin(),geomID); |
146 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
147 | |
148 | /* if we need to filter out geometry, run again */ |
149 | if (pinfo.size() != numPrimRefs) |
150 | { |
151 | progressMonitor(0); |
152 | pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { |
153 | return geometry->createPrimRefArray(prims,r,k: base.size(),geomID); |
154 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
155 | } |
156 | return pinfo; |
157 | } |
158 | |
159 | __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref) |
160 | { |
161 | const Vec3fa lower = ref.lower; |
162 | const Vec3fa upper = ref.upper; |
163 | const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); |
164 | const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); |
165 | Vec3ia ilower(floor(a: glower)); |
166 | Vec3ia iupper(floor(a: gupper)); |
167 | |
168 | /* this ignores dimensions that are empty */ |
169 | iupper = (Vec3ia)select(m: vint4(glower) >= vint4(gupper),t: vint4(ilower),f: vint4(iupper)); |
170 | |
171 | /* compute a morton code for the lower and upper grid coordinates. */ |
172 | const unsigned int lower_code = bitInterleave(xin: ilower.x,yin: ilower.y,zin: ilower.z); |
173 | const unsigned int upper_code = bitInterleave(xin: iupper.x,yin: iupper.y,zin: iupper.z); |
174 | return Vec2i(lower_code,upper_code); |
175 | } |
176 | |
177 | template<typename Mesh, typename SplitterFactory> |
178 | PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) |
179 | { |
180 | static const size_t MIN_STEP_SIZE = 128; |
181 | |
182 | ParallelForForPrefixSumState<PrimInfo> pstate; |
183 | Scene::Iterator2 iter(scene,types,mblur); |
184 | |
185 | /* first try */ |
186 | progressMonitor(0); |
187 | pstate.init(array2&: iter,minStepSize: size_t(1024)); |
188 | PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { |
189 | return mesh->createPrimRefArray(prims,r,k,geomID: (unsigned)geomID); |
190 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
191 | |
192 | /* if we need to filter out geometry, run again */ |
193 | if (pinfo.size() != numPrimRefs) |
194 | { |
195 | progressMonitor(0); |
196 | pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { |
197 | return mesh->createPrimRefArray(prims,r,k: base.size(),geomID: (unsigned)geomID); |
198 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
199 | } |
200 | |
201 | /* use correct number of primitives */ |
202 | size_t numPrimitives = pinfo.size(); |
203 | const size_t alloc_numPrimitives = prims.size(); |
204 | const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; |
205 | |
206 | /* set up primitive splitter */ |
207 | SplitterFactory Splitter(scene); |
208 | |
209 | |
210 | DBG_PRESPLIT( |
211 | const size_t org_numPrimitives = pinfo.size(); |
212 | PRINT(numPrimitives); |
213 | PRINT(alloc_numPrimitives); |
214 | PRINT(numSplitPrimitivesBudget); |
215 | ); |
216 | |
217 | /* allocate double buffer presplit items */ |
218 | const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives; |
219 | PresplitItem *presplitItem = (PresplitItem*)alignedMalloc(size: presplit_allocation_size,align: 64); |
220 | PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(size: presplit_allocation_size,align: 64); |
221 | |
222 | /* compute grid */ |
223 | const Vec3fa grid_base = pinfo.geomBounds.lower; |
224 | const Vec3fa grid_diag = pinfo.geomBounds.size(); |
225 | const float grid_extend = max(a: grid_diag.x,b: max(a: grid_diag.y,b: grid_diag.z)); |
226 | const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend; |
227 | |
228 | /* init presplit items and get total sum */ |
229 | const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float { |
230 | float sum = 0.0f; |
231 | for (size_t i=r.begin(); i<r.end(); i++) |
232 | { |
233 | presplitItem[i].index = (unsigned int)i; |
234 | const Vec2i mc = computeMC(grid_base,grid_scale,ref: prims[i]); |
235 | /* if all bits are equal then we cannot split */ |
236 | presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f; |
237 | /* FIXME: sum undeterministic */ |
238 | sum += presplitItem[i].priority; |
239 | } |
240 | return sum; |
241 | },[](const float& a, const float& b) -> float { return a+b; }); |
242 | |
243 | /* compute number of splits per primitive */ |
244 | const float inv_psum = 1.0f / psum; |
245 | parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { |
246 | for (size_t i=r.begin(); i<r.end(); i++) |
247 | { |
248 | if (presplitItem[i].priority > 0.0f) |
249 | { |
250 | const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum; |
251 | if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims |
252 | { |
253 | presplitItem[i].priority = max(a: min(a: ceilf(x: logf(x: rel_p)/logf(x: 2.0f)),b: (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),b: 1.0f); |
254 | //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); |
255 | assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); |
256 | } |
257 | else |
258 | presplitItem[i].priority = 0.0f; |
259 | } |
260 | } |
261 | }); |
262 | |
263 | auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; }; |
264 | size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024); |
265 | |
266 | /* anything to split ? */ |
267 | if (center < numPrimitives) |
268 | { |
269 | size_t numPrimitivesToSplit = numPrimitives - center; |
270 | assert(presplitItem[center].priority >= 1.0f); |
271 | |
272 | /* sort presplit items in ascending order */ |
273 | radix_sort_u32(src: presplitItem + center,tmp: tmp_presplitItem + center,N: numPrimitivesToSplit,blockSize: 1024); |
274 | |
275 | CHECK_PRESPLIT( |
276 | parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { |
277 | for (size_t i=r.begin(); i<r.end(); i++) |
278 | assert(presplitItem[i-1].priority <= presplitItem[i].priority); |
279 | }); |
280 | ); |
281 | |
282 | unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem; |
283 | unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; |
284 | |
285 | /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */ |
286 | const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t { |
287 | size_t sum = 0; |
288 | for (size_t i=t.begin(); i<t.end(); i++) |
289 | { |
290 | PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; |
291 | assert(presplitItem[i].priority >= 1.0f); |
292 | const unsigned int primrefID = presplitItem[i].index; |
293 | const float prio = presplitItem[i].priority; |
294 | const unsigned int geomID = prims[primrefID].geomID(); |
295 | const unsigned int primID = prims[primrefID].primID(); |
296 | const unsigned int split_levels = (unsigned int)prio; |
297 | unsigned int numSubPrims = 0; |
298 | splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); |
299 | assert(numSubPrims); |
300 | numSubPrims--; // can reuse slot |
301 | sum+=numSubPrims; |
302 | presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; |
303 | primOffset0[i-center] = numSubPrims; |
304 | } |
305 | return sum; |
306 | },[](const size_t& a, const size_t& b) -> size_t { return a+b; }); |
307 | |
308 | /* if we are over budget, need to shrink the range */ |
309 | if (totalNumSubPrims > numSplitPrimitivesBudget) |
310 | { |
311 | size_t new_center = numPrimitives-1; |
312 | size_t sum = 0; |
313 | for (;new_center>=center;new_center--) |
314 | { |
315 | const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG; |
316 | if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break; |
317 | sum += numSubPrims; |
318 | } |
319 | new_center++; |
320 | |
321 | primOffset0 += new_center - center; |
322 | numPrimitivesToSplit -= new_center - center; |
323 | center = new_center; |
324 | assert(numPrimitivesToSplit == (numPrimitives - center)); |
325 | } |
326 | |
327 | /* parallel prefix sum to compute offsets for storing sub-primitives */ |
328 | const unsigned int offset = parallel_prefix_sum(src: primOffset0,dst&: primOffset1,N: numPrimitivesToSplit,identity: (unsigned int)0,add: std::plus<unsigned int>()); |
329 | assert(numPrimitives+offset <= alloc_numPrimitives); |
330 | |
331 | /* iterate over range, and split primitives into sub primitives and append them to prims array */ |
332 | parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void { |
333 | for (size_t j=rn.begin(); j<rn.end(); j++) |
334 | { |
335 | PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; |
336 | const unsigned int primrefID = presplitItem[j].index; |
337 | const unsigned int geomID = prims[primrefID].geomID(); |
338 | const unsigned int primID = prims[primrefID].primID(); |
339 | const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1); |
340 | |
341 | assert(split_levels); |
342 | assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); |
343 | unsigned int numSubPrims = 0; |
344 | splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); |
345 | const size_t newID = numPrimitives + primOffset1[j-center]; |
346 | assert(newID+numSubPrims-1 <= alloc_numPrimitives); |
347 | prims[primrefID] = subPrims[0]; |
348 | for (size_t i=1;i<numSubPrims;i++) |
349 | prims[newID+i-1] = subPrims[i]; |
350 | } |
351 | }); |
352 | |
353 | numPrimitives += offset; |
354 | DBG_PRESPLIT( |
355 | PRINT(pinfo.size()); |
356 | PRINT(numPrimitives); |
357 | PRINT((float)numPrimitives/org_numPrimitives)); |
358 | } |
359 | |
360 | /* recompute centroid bounding boxes */ |
361 | pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo { |
362 | PrimInfo p(empty); |
363 | for (size_t j=r.begin(); j<r.end(); j++) |
364 | p.add_center2(prim: prims[j]); |
365 | return p; |
366 | }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); |
367 | |
368 | assert(pinfo.size() == numPrimitives); |
369 | |
370 | /* free double buffer presplit items */ |
371 | alignedFree(ptr: tmp_presplitItem); |
372 | alignedFree(ptr: presplitItem); |
373 | return pinfo; |
374 | } |
375 | } |
376 | } |
377 | |