1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "bvh.h" |
7 | #include "node_intersector1.h" |
8 | #include "../common/stack_item.h" |
9 | |
10 | #define NEW_SORTING_CODE 1 |
11 | |
12 | namespace embree |
13 | { |
14 | namespace isa |
15 | { |
16 | /*! BVH regular node traversal for single rays. */ |
17 | template<int N, int types> |
18 | class BVHNNodeTraverser1Hit; |
19 | |
20 | #if defined(__AVX512VL__) // SKX |
21 | |
22 | template<int N> |
23 | __forceinline void isort_update(vint<N> &dist, const vint<N> &d) |
24 | { |
25 | const vint<N> dist_shift = align_shift_right<N-1>(dist,dist); |
26 | const vboolf<N> m_geq = d >= dist; |
27 | const vboolf<N> m_geq_shift = m_geq << 1; |
28 | dist = select(m_geq,d,dist); |
29 | dist = select(m_geq_shift,dist_shift,dist); |
30 | } |
31 | |
32 | template<int N> |
33 | __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) { |
34 | dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero))); |
35 | } |
36 | |
37 | __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { |
38 | return toScalar(permutex2var((__m256i)index,n0,n1)); |
39 | } |
40 | |
41 | __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { |
42 | return toScalar(permute(n,index)); |
43 | } |
44 | |
45 | #endif |
46 | |
47 | /* Specialization for BVH4. */ |
48 | template<int types> |
49 | class BVHNNodeTraverser1Hit<4, types> |
50 | { |
51 | typedef BVH4 BVH; |
52 | typedef BVH4::NodeRef NodeRef; |
53 | typedef BVH4::BaseNode BaseNode; |
54 | |
55 | |
56 | public: |
57 | /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */ |
58 | static __forceinline void traverseClosestHit(NodeRef& cur, |
59 | size_t mask, |
60 | const vfloat4& tNear, |
61 | StackItemT<NodeRef>*& stackPtr, |
62 | StackItemT<NodeRef>* stackEnd) |
63 | { |
64 | assert(mask != 0); |
65 | const BaseNode* node = cur.baseNode(); |
66 | |
67 | /*! one child is hit, continue with that child */ |
68 | size_t r = bscf(v&: mask); |
69 | cur = node->child(i: r); |
70 | BVH::prefetch(ref: cur,types); |
71 | if (likely(mask == 0)) { |
72 | assert(cur != BVH::emptyNode); |
73 | return; |
74 | } |
75 | |
76 | /*! two children are hit, push far child, and continue with closer child */ |
77 | NodeRef c0 = cur; |
78 | const unsigned int d0 = ((unsigned int*)&tNear)[r]; |
79 | r = bscf(v&: mask); |
80 | NodeRef c1 = node->child(i: r); |
81 | BVH::prefetch(ref: c1,types); |
82 | const unsigned int d1 = ((unsigned int*)&tNear)[r]; |
83 | assert(c0 != BVH::emptyNode); |
84 | assert(c1 != BVH::emptyNode); |
85 | if (likely(mask == 0)) { |
86 | assert(stackPtr < stackEnd); |
87 | if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } |
88 | else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } |
89 | } |
90 | |
91 | #if NEW_SORTING_CODE == 1 |
92 | vint4 s0((size_t)c0,(size_t)d0); |
93 | vint4 s1((size_t)c1,(size_t)d1); |
94 | r = bscf(v&: mask); |
95 | NodeRef c2 = node->child(i: r); BVH::prefetch(ref: c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; |
96 | vint4 s2((size_t)c2,(size_t)d2); |
97 | /* 3 hits */ |
98 | if (likely(mask == 0)) { |
99 | StackItemT<NodeRef>::sort3(s1&: s0,s2&: s1,s3&: s2); |
100 | *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; |
101 | cur = toSizeT(v: s2); |
102 | stackPtr+=2; |
103 | return; |
104 | } |
105 | r = bscf(v&: mask); |
106 | NodeRef c3 = node->child(i: r); BVH::prefetch(ref: c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; |
107 | vint4 s3((size_t)c3,(size_t)d3); |
108 | /* 4 hits */ |
109 | StackItemT<NodeRef>::sort4(s1&: s0,s2&: s1,s3&: s2,s4&: s3); |
110 | *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; |
111 | cur = toSizeT(v: s3); |
112 | stackPtr+=3; |
113 | #else |
114 | /*! Here starts the slow path for 3 or 4 hit children. We push |
115 | * all nodes onto the stack to sort them there. */ |
116 | assert(stackPtr < stackEnd); |
117 | stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; |
118 | assert(stackPtr < stackEnd); |
119 | stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; |
120 | |
121 | /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ |
122 | assert(stackPtr < stackEnd); |
123 | r = bscf(mask); |
124 | NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; |
125 | assert(c != BVH::emptyNode); |
126 | if (likely(mask == 0)) { |
127 | sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); |
128 | cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; |
129 | return; |
130 | } |
131 | |
132 | /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ |
133 | assert(stackPtr < stackEnd); |
134 | r = bscf(mask); |
135 | c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; |
136 | assert(c != BVH::emptyNode); |
137 | sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); |
138 | cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; |
139 | #endif |
140 | } |
141 | |
142 | /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */ |
143 | static __forceinline void traverseAnyHit(NodeRef& cur, |
144 | size_t mask, |
145 | const vfloat4& tNear, |
146 | NodeRef*& stackPtr, |
147 | NodeRef* stackEnd) |
148 | { |
149 | const BaseNode* node = cur.baseNode(); |
150 | |
151 | /*! one child is hit, continue with that child */ |
152 | size_t r = bscf(v&: mask); |
153 | cur = node->child(i: r); |
154 | BVH::prefetch(ref: cur,types); |
155 | |
156 | /* simpler in sequence traversal order */ |
157 | assert(cur != BVH::emptyNode); |
158 | if (likely(mask == 0)) return; |
159 | assert(stackPtr < stackEnd); |
160 | *stackPtr = cur; stackPtr++; |
161 | |
162 | for (; ;) |
163 | { |
164 | r = bscf(v&: mask); |
165 | cur = node->child(i: r); BVH::prefetch(ref: cur,types); |
166 | assert(cur != BVH::emptyNode); |
167 | if (likely(mask == 0)) return; |
168 | assert(stackPtr < stackEnd); |
169 | *stackPtr = cur; stackPtr++; |
170 | } |
171 | } |
172 | }; |
173 | |
174 | /* Specialization for BVH8. */ |
175 | template<int types> |
176 | class BVHNNodeTraverser1Hit<8, types> |
177 | { |
178 | typedef BVH8 BVH; |
179 | typedef BVH8::NodeRef NodeRef; |
180 | typedef BVH8::BaseNode BaseNode; |
181 | |
182 | #if defined(__AVX512VL__) |
183 | template<class NodeRef, class BaseNode> |
184 | static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur, |
185 | size_t mask, |
186 | const vfloat8& tNear, |
187 | StackItemT<NodeRef>*& stackPtr, |
188 | StackItemT<NodeRef>* stackEnd) |
189 | { |
190 | assert(mask != 0); |
191 | const BaseNode* node = cur.baseNode(); |
192 | const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]); |
193 | const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]); |
194 | vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step); |
195 | distance_i = vint8::compact((int)mask,distance_i,distance_i); |
196 | cur = permuteExtract(distance_i,n0,n1); |
197 | BVH::prefetch(cur,types); |
198 | |
199 | mask &= mask-1; |
200 | if (likely(mask == 0)) return; |
201 | |
202 | /* 2 hits: order A0 B0 */ |
203 | const vint8 d0(distance_i); |
204 | const vint8 d1(shuffle<1>(distance_i)); |
205 | cur = permuteExtract(d1,n0,n1); |
206 | BVH::prefetch(cur,types); |
207 | |
208 | const vint8 dist_A0 = min(d0, d1); |
209 | const vint8 dist_B0 = max(d0, d1); |
210 | assert(dist_A0[0] < dist_B0[0]); |
211 | |
212 | mask &= mask-1; |
213 | if (likely(mask == 0)) { |
214 | cur = permuteExtract(dist_A0,n0,n1); |
215 | stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1); |
216 | *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear); |
217 | stackPtr++; |
218 | return; |
219 | } |
220 | |
221 | /* 3 hits: order A1 B1 C1 */ |
222 | |
223 | const vint8 d2(shuffle<2>(distance_i)); |
224 | cur = permuteExtract(d2,n0,n1); |
225 | BVH::prefetch(cur,types); |
226 | |
227 | const vint8 dist_A1 = min(dist_A0,d2); |
228 | const vint8 dist_tmp_B1 = max(dist_A0,d2); |
229 | const vint8 dist_B1 = min(dist_B0,dist_tmp_B1); |
230 | const vint8 dist_C1 = max(dist_B0,dist_tmp_B1); |
231 | assert(dist_A1[0] < dist_B1[0]); |
232 | assert(dist_B1[0] < dist_C1[0]); |
233 | |
234 | mask &= mask-1; |
235 | if (likely(mask == 0)) { |
236 | cur = permuteExtract(dist_A1,n0,n1); |
237 | stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1); |
238 | *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear); |
239 | stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1); |
240 | *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear); |
241 | stackPtr+=2; |
242 | return; |
243 | } |
244 | |
245 | /* 4 hits: order A2 B2 C2 D2 */ |
246 | |
247 | const vint8 d3(shuffle<3>(distance_i)); |
248 | cur = permuteExtract(d3,n0,n1); |
249 | BVH::prefetch(cur,types); |
250 | |
251 | const vint8 dist_A2 = min(dist_A1,d3); |
252 | const vint8 dist_tmp_B2 = max(dist_A1,d3); |
253 | const vint8 dist_B2 = min(dist_B1,dist_tmp_B2); |
254 | const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2); |
255 | const vint8 dist_C2 = min(dist_C1,dist_tmp_C2); |
256 | const vint8 dist_D2 = max(dist_C1,dist_tmp_C2); |
257 | assert(dist_A2[0] < dist_B2[0]); |
258 | assert(dist_B2[0] < dist_C2[0]); |
259 | assert(dist_C2[0] < dist_D2[0]); |
260 | |
261 | mask &= mask-1; |
262 | if (likely(mask == 0)) { |
263 | cur = permuteExtract(dist_A2,n0,n1); |
264 | stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1); |
265 | *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear); |
266 | stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1); |
267 | *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear); |
268 | stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1); |
269 | *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear); |
270 | stackPtr+=3; |
271 | return; |
272 | } |
273 | |
274 | /* >=5 hits: reverse to descending order for writing to stack */ |
275 | |
276 | distance_i = align_shift_right<3>(distance_i,distance_i); |
277 | const size_t hits = 4 + popcnt(mask); |
278 | vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert |
279 | |
280 | isort_quick_update<8>(dist,dist_A2); |
281 | isort_quick_update<8>(dist,dist_B2); |
282 | isort_quick_update<8>(dist,dist_C2); |
283 | isort_quick_update<8>(dist,dist_D2); |
284 | |
285 | do { |
286 | |
287 | distance_i = align_shift_right<1>(distance_i,distance_i); |
288 | cur = permuteExtract(distance_i,n0,n1); |
289 | BVH::prefetch(cur,types); |
290 | const vint8 new_dist(permute(distance_i,vint8(zero))); |
291 | mask &= mask-1; |
292 | isort_update<8>(dist,new_dist); |
293 | |
294 | } while(mask); |
295 | |
296 | for (size_t i=0; i<7; i++) |
297 | assert(dist[i+0]>=dist[i+1]); |
298 | |
299 | for (size_t i=0;i<hits-1;i++) |
300 | { |
301 | stackPtr->ptr = permuteExtract(dist,n0,n1); |
302 | *(float*)&stackPtr->dist = permuteExtract(dist,tNear); |
303 | dist = align_shift_right<1>(dist,dist); |
304 | stackPtr++; |
305 | } |
306 | cur = permuteExtract(dist,n0,n1); |
307 | } |
308 | #endif |
309 | |
310 | public: |
311 | static __forceinline void traverseClosestHit(NodeRef& cur, |
312 | size_t mask, |
313 | const vfloat8& tNear, |
314 | StackItemT<NodeRef>*& stackPtr, |
315 | StackItemT<NodeRef>* stackEnd) |
316 | { |
317 | assert(mask != 0); |
318 | #if defined(__AVX512VL__) |
319 | traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); |
320 | #else |
321 | |
322 | const BaseNode* node = cur.baseNode(); |
323 | |
324 | /*! one child is hit, continue with that child */ |
325 | size_t r = bscf(v&: mask); |
326 | cur = node->child(i: r); |
327 | BVH::prefetch(ref: cur,types); |
328 | if (likely(mask == 0)) { |
329 | assert(cur != BVH::emptyNode); |
330 | return; |
331 | } |
332 | |
333 | /*! two children are hit, push far child, and continue with closer child */ |
334 | NodeRef c0 = cur; |
335 | const unsigned int d0 = ((unsigned int*)&tNear)[r]; |
336 | r = bscf(v&: mask); |
337 | NodeRef c1 = node->child(i: r); |
338 | BVH::prefetch(ref: c1,types); |
339 | const unsigned int d1 = ((unsigned int*)&tNear)[r]; |
340 | |
341 | assert(c0 != BVH::emptyNode); |
342 | assert(c1 != BVH::emptyNode); |
343 | if (likely(mask == 0)) { |
344 | assert(stackPtr < stackEnd); |
345 | if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } |
346 | else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } |
347 | } |
348 | #if NEW_SORTING_CODE == 1 |
349 | vint4 s0((size_t)c0,(size_t)d0); |
350 | vint4 s1((size_t)c1,(size_t)d1); |
351 | |
352 | r = bscf(v&: mask); |
353 | NodeRef c2 = node->child(i: r); BVH::prefetch(ref: c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; |
354 | vint4 s2((size_t)c2,(size_t)d2); |
355 | /* 3 hits */ |
356 | if (likely(mask == 0)) { |
357 | StackItemT<NodeRef>::sort3(s1&: s0,s2&: s1,s3&: s2); |
358 | *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; |
359 | cur = toSizeT(v: s2); |
360 | stackPtr+=2; |
361 | return; |
362 | } |
363 | r = bscf(v&: mask); |
364 | NodeRef c3 = node->child(i: r); BVH::prefetch(ref: c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; |
365 | vint4 s3((size_t)c3,(size_t)d3); |
366 | /* 4 hits */ |
367 | if (likely(mask == 0)) { |
368 | StackItemT<NodeRef>::sort4(s1&: s0,s2&: s1,s3&: s2,s4&: s3); |
369 | *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; |
370 | cur = toSizeT(v: s3); |
371 | stackPtr+=3; |
372 | return; |
373 | } |
374 | *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3; |
375 | /*! fallback case if more than 4 children are hit */ |
376 | StackItemT<NodeRef>* stackFirst = stackPtr; |
377 | stackPtr+=4; |
378 | while (1) |
379 | { |
380 | assert(stackPtr < stackEnd); |
381 | r = bscf(v&: mask); |
382 | NodeRef c = node->child(i: r); BVH::prefetch(ref: c,types); unsigned int d = *(unsigned int*)&tNear[r]; |
383 | const vint4 s((size_t)c,(size_t)d); |
384 | *(vint4*)stackPtr++ = s; |
385 | assert(c != BVH::emptyNode); |
386 | if (unlikely(mask == 0)) break; |
387 | } |
388 | sort(begin: stackFirst,end: stackPtr); |
389 | cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; |
390 | #else |
391 | /*! Here starts the slow path for 3 or 4 hit children. We push |
392 | * all nodes onto the stack to sort them there. */ |
393 | assert(stackPtr < stackEnd); |
394 | stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; |
395 | assert(stackPtr < stackEnd); |
396 | stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; |
397 | |
398 | /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ |
399 | assert(stackPtr < stackEnd); |
400 | r = bscf(mask); |
401 | NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; |
402 | assert(c != BVH::emptyNode); |
403 | if (likely(mask == 0)) { |
404 | sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); |
405 | cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; |
406 | return; |
407 | } |
408 | |
409 | /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ |
410 | assert(stackPtr < stackEnd); |
411 | r = bscf(mask); |
412 | c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; |
413 | assert(c != BVH::emptyNode); |
414 | if (likely(mask == 0)) { |
415 | sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); |
416 | cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; |
417 | return; |
418 | } |
419 | /*! fallback case if more than 4 children are hit */ |
420 | StackItemT<NodeRef>* stackFirst = stackPtr-4; |
421 | while (1) |
422 | { |
423 | assert(stackPtr < stackEnd); |
424 | r = bscf(mask); |
425 | c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; |
426 | assert(c != BVH::emptyNode); |
427 | if (unlikely(mask == 0)) break; |
428 | } |
429 | sort(stackFirst,stackPtr); |
430 | cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; |
431 | #endif |
432 | #endif |
433 | } |
434 | |
435 | static __forceinline void traverseAnyHit(NodeRef& cur, |
436 | size_t mask, |
437 | const vfloat8& tNear, |
438 | NodeRef*& stackPtr, |
439 | NodeRef* stackEnd) |
440 | { |
441 | const BaseNode* node = cur.baseNode(); |
442 | |
443 | /*! one child is hit, continue with that child */ |
444 | size_t r = bscf(v&: mask); |
445 | cur = node->child(i: r); |
446 | BVH::prefetch(ref: cur,types); |
447 | |
448 | /* simpler in sequence traversal order */ |
449 | assert(cur != BVH::emptyNode); |
450 | if (likely(mask == 0)) return; |
451 | assert(stackPtr < stackEnd); |
452 | *stackPtr = cur; stackPtr++; |
453 | |
454 | for (; ;) |
455 | { |
456 | r = bscf(v&: mask); |
457 | cur = node->child(i: r); BVH::prefetch(ref: cur,types); |
458 | assert(cur != BVH::emptyNode); |
459 | if (likely(mask == 0)) return; |
460 | assert(stackPtr < stackEnd); |
461 | *stackPtr = cur; stackPtr++; |
462 | } |
463 | } |
464 | }; |
465 | } |
466 | } |
467 | |