1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #include "bvh_rotate.h" |
5 | |
6 | namespace embree |
7 | { |
8 | namespace isa |
9 | { |
10 | /*! Computes half surface area of box. */ |
11 | __forceinline float halfArea3f(const BBox<vfloat4>& box) { |
12 | const vfloat4 d = box.size(); |
13 | const vfloat4 a = d*shuffle<1,2,0,3>(v: d); |
14 | return a[0]+a[1]+a[2]; |
15 | } |
16 | |
17 | size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth) |
18 | { |
19 | /*! nothing to rotate if we reached a leaf node. */ |
20 | if (parentRef.isBarrier()) return 0; |
21 | if (parentRef.isLeaf()) return 0; |
22 | AABBNode* parent = parentRef.getAABBNode(); |
23 | |
24 | /*! rotate all children first */ |
25 | vint4 cdepth; |
26 | for (size_t c=0; c<4; c++) |
27 | cdepth[c] = (int)rotate(parentRef: parent->child(i: c),depth: depth+1); |
28 | |
29 | /* compute current areas of all children */ |
30 | vfloat4 sizeX = parent->upper_x-parent->lower_x; |
31 | vfloat4 sizeY = parent->upper_y-parent->lower_y; |
32 | vfloat4 sizeZ = parent->upper_z-parent->lower_z; |
33 | vfloat4 childArea = madd(a: sizeX,b: (sizeY + sizeZ),c: sizeY*sizeZ); |
34 | |
35 | /*! get node bounds */ |
36 | BBox<vfloat4> child1_0,child1_1,child1_2,child1_3; |
37 | parent->bounds(bounds0&: child1_0,bounds1&: child1_1,bounds2&: child1_2,bounds3&: child1_3); |
38 | |
39 | /*! Find best rotation. We pick a first child (child1) and a sub-child |
40 | (child2child) of a different second child (child2), and swap child1 |
41 | and child2child. We perform the best such swap. */ |
42 | float bestArea = 0; |
43 | size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1; |
44 | for (size_t c2=0; c2<4; c2++) |
45 | { |
46 | /*! ignore leaf nodes as we cannot descent into them */ |
47 | if (parent->child(i: c2).isBarrier()) continue; |
48 | if (parent->child(i: c2).isLeaf()) continue; |
49 | AABBNode* child2 = parent->child(i: c2).getAABBNode(); |
50 | |
51 | /*! transpose child bounds */ |
52 | BBox<vfloat4> child2c0,child2c1,child2c2,child2c3; |
53 | child2->bounds(bounds0&: child2c0,bounds1&: child2c1,bounds2&: child2c2,bounds3&: child2c3); |
54 | |
55 | /*! put child1_0 at each child2 position */ |
56 | float cost00 = halfArea3f(box: merge(a: child1_0,b: child2c1,c: child2c2,d: child2c3)); |
57 | float cost01 = halfArea3f(box: merge(a: child2c0,b: child1_0,c: child2c2,d: child2c3)); |
58 | float cost02 = halfArea3f(box: merge(a: child2c0,b: child2c1,c: child1_0,d: child2c3)); |
59 | float cost03 = halfArea3f(box: merge(a: child2c0,b: child2c1,c: child2c2,d: child1_0)); |
60 | vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03); |
61 | vfloat4 min0 = vreduce_min(v: cost0); |
62 | int pos0 = (int)bsf(v: movemask(a: min0 == cost0)); |
63 | |
64 | /*! put child1_1 at each child2 position */ |
65 | float cost10 = halfArea3f(box: merge(a: child1_1,b: child2c1,c: child2c2,d: child2c3)); |
66 | float cost11 = halfArea3f(box: merge(a: child2c0,b: child1_1,c: child2c2,d: child2c3)); |
67 | float cost12 = halfArea3f(box: merge(a: child2c0,b: child2c1,c: child1_1,d: child2c3)); |
68 | float cost13 = halfArea3f(box: merge(a: child2c0,b: child2c1,c: child2c2,d: child1_1)); |
69 | vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13); |
70 | vfloat4 min1 = vreduce_min(v: cost1); |
71 | int pos1 = (int)bsf(v: movemask(a: min1 == cost1)); |
72 | |
73 | /*! put child1_2 at each child2 position */ |
74 | float cost20 = halfArea3f(box: merge(a: child1_2,b: child2c1,c: child2c2,d: child2c3)); |
75 | float cost21 = halfArea3f(box: merge(a: child2c0,b: child1_2,c: child2c2,d: child2c3)); |
76 | float cost22 = halfArea3f(box: merge(a: child2c0,b: child2c1,c: child1_2,d: child2c3)); |
77 | float cost23 = halfArea3f(box: merge(a: child2c0,b: child2c1,c: child2c2,d: child1_2)); |
78 | vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23); |
79 | vfloat4 min2 = vreduce_min(v: cost2); |
80 | int pos2 = (int)bsf(v: movemask(a: min2 == cost2)); |
81 | |
82 | /*! put child1_3 at each child2 position */ |
83 | float cost30 = halfArea3f(box: merge(a: child1_3,b: child2c1,c: child2c2,d: child2c3)); |
84 | float cost31 = halfArea3f(box: merge(a: child2c0,b: child1_3,c: child2c2,d: child2c3)); |
85 | float cost32 = halfArea3f(box: merge(a: child2c0,b: child2c1,c: child1_3,d: child2c3)); |
86 | float cost33 = halfArea3f(box: merge(a: child2c0,b: child2c1,c: child2c2,d: child1_3)); |
87 | vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33); |
88 | vfloat4 min3 = vreduce_min(v: cost3); |
89 | int pos3 = (int)bsf(v: movemask(a: min3 == cost3)); |
90 | |
91 | /*! find best other child */ |
92 | vfloat4 area0123 = vfloat4(extract<0>(a: min0),extract<0>(a: min1),extract<0>(a: min2),extract<0>(a: min3)) - vfloat4(childArea[c2]); |
93 | int pos[4] = { pos0,pos1,pos2,pos3 }; |
94 | const size_t mbd = BVH4::maxBuildDepth; |
95 | vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints |
96 | valid &= vint4(int(c2)) != vint4(step); |
97 | if (none(b: valid)) continue; |
98 | size_t c1 = select_min(valid,v: area0123); |
99 | float area = area0123[c1]; |
100 | if (c1 == c2) continue; // can happen if bounds are NANs |
101 | |
102 | /*! accept a swap when it reduces cost and is not swapping a node with itself */ |
103 | if (area < bestArea) { |
104 | bestArea = area; |
105 | bestChild1 = c1; |
106 | bestChild2 = c2; |
107 | bestChild2Child = pos[c1]; |
108 | } |
109 | } |
110 | |
111 | /*! if we did not find a swap that improves the SAH then do nothing */ |
112 | if (bestChild1 == size_t(-1)) return 1+reduce_max(v: cdepth); |
113 | |
114 | /*! perform the best found tree rotation */ |
115 | AABBNode* child2 = parent->child(i: bestChild2).getAABBNode(); |
116 | AABBNode::swap(a: parent,i: bestChild1,b: child2,j: bestChild2Child); |
117 | parent->setBounds(i: bestChild2,bounds: child2->bounds()); |
118 | AABBNode::compact(a: parent); |
119 | AABBNode::compact(a: child2); |
120 | |
121 | /*! This returned depth is conservative as the child that was |
122 | * pulled up in the tree could have been on the critical path. */ |
123 | cdepth[bestChild1]++; // bestChild1 was pushed down one level |
124 | return 1+reduce_max(v: cdepth); |
125 | } |
126 | } |
127 | } |
128 | |