1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "bvh_node_base.h" |
7 | |
8 | namespace embree |
9 | { |
10 | /*! BVHN Quantized Node */ |
11 | template<int N> |
12 | struct __aligned(8) QuantizedBaseNode_t |
13 | { |
14 | typedef unsigned char T; |
15 | static const T MIN_QUAN = 0; |
16 | static const T MAX_QUAN = 255; |
17 | |
18 | /*! Clears the node. */ |
19 | __forceinline void clear() { |
20 | for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN; |
21 | for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN; |
22 | } |
23 | |
24 | /*! Returns bounds of specified child. */ |
25 | __forceinline BBox3fa bounds(size_t i) const |
26 | { |
27 | assert(i < N); |
28 | const Vec3fa lower(madd(a: scale.x,b: (float)lower_x[i],c: start.x), |
29 | madd(a: scale.y,b: (float)lower_y[i],c: start.y), |
30 | madd(a: scale.z,b: (float)lower_z[i],c: start.z)); |
31 | const Vec3fa upper(madd(a: scale.x,b: (float)upper_x[i],c: start.x), |
32 | madd(a: scale.y,b: (float)upper_y[i],c: start.y), |
33 | madd(a: scale.z,b: (float)upper_z[i],c: start.z)); |
34 | return BBox3fa(lower,upper); |
35 | } |
36 | |
37 | /*! Returns extent of bounds of specified child. */ |
38 | __forceinline Vec3fa extent(size_t i) const { |
39 | return bounds(i).size(); |
40 | } |
41 | |
42 | static __forceinline void init_dim(const vfloat<N> &lower, |
43 | const vfloat<N> &upper, |
44 | T lower_quant[N], |
45 | T upper_quant[N], |
46 | float &start, |
47 | float &scale) |
48 | { |
49 | /* quantize bounds */ |
50 | const vbool<N> m_valid = lower != vfloat<N>(pos_inf); |
51 | const float minF = reduce_min(lower); |
52 | const float maxF = reduce_max(upper); |
53 | float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); |
54 | float decode_scale = diff / float(MAX_QUAN); |
55 | if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero |
56 | assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); |
57 | const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f; |
58 | vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN); |
59 | vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN); |
60 | |
61 | /* lower/upper correction */ |
62 | vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower; |
63 | vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper; |
64 | ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); |
65 | iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); |
66 | |
67 | /* disable invalid lanes */ |
68 | ilower = select(m_valid,ilower,MAX_QUAN); |
69 | iupper = select(m_valid,iupper,MIN_QUAN); |
70 | |
71 | /* store as uchar to memory */ |
72 | vint<N>::store(lower_quant,ilower); |
73 | vint<N>::store(upper_quant,iupper); |
74 | start = minF; |
75 | scale = decode_scale; |
76 | |
77 | #if defined(DEBUG) |
78 | vfloat<N> extract_lower( vint<N>::loadu(lower_quant) ); |
79 | vfloat<N> extract_upper( vint<N>::loadu(upper_quant) ); |
80 | vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF); |
81 | vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF); |
82 | assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); |
83 | assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid)); |
84 | #endif |
85 | } |
86 | |
87 | __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node) |
88 | { |
89 | init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x); |
90 | init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y); |
91 | init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z); |
92 | } |
93 | |
94 | __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); } |
95 | |
96 | #if defined(__AVX512F__) // KNL |
97 | __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } |
98 | #endif |
99 | __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); } |
100 | |
101 | __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); } |
102 | |
103 | __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); } |
104 | |
105 | __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); } |
106 | |
107 | __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); } |
108 | |
109 | __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); } |
110 | |
111 | template <int M> |
112 | __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); } |
113 | |
114 | #if defined(__AVX512F__) |
115 | __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } |
116 | __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); } |
117 | __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); } |
118 | #endif |
119 | |
120 | union { |
121 | struct { |
122 | T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children |
123 | T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children |
124 | T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children |
125 | T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children |
126 | T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children |
127 | T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children |
128 | }; |
129 | T all_planes[6*N]; |
130 | }; |
131 | |
132 | Vec3f start; |
133 | Vec3f scale; |
134 | |
135 | friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) |
136 | { |
137 | o << "QuantizedBaseNode { " << embree_endl; |
138 | o << " start " << n.start << embree_endl; |
139 | o << " scale " << n.scale << embree_endl; |
140 | o << " lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl; |
141 | o << " upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl; |
142 | o << " lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl; |
143 | o << " upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl; |
144 | o << " lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl; |
145 | o << " upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl; |
146 | o << "}" << embree_endl; |
147 | return o; |
148 | } |
149 | |
150 | }; |
151 | |
152 | template<typename NodeRef, int N> |
153 | struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N> |
154 | { |
155 | using BaseNode_t<NodeRef,N>::children; |
156 | using QuantizedBaseNode_t<N>::lower_x; |
157 | using QuantizedBaseNode_t<N>::upper_x; |
158 | using QuantizedBaseNode_t<N>::lower_y; |
159 | using QuantizedBaseNode_t<N>::upper_y; |
160 | using QuantizedBaseNode_t<N>::lower_z; |
161 | using QuantizedBaseNode_t<N>::upper_z; |
162 | using QuantizedBaseNode_t<N>::start; |
163 | using QuantizedBaseNode_t<N>::scale; |
164 | using QuantizedBaseNode_t<N>::init_dim; |
165 | |
166 | __forceinline void setRef(size_t i, const NodeRef& ref) { |
167 | assert(i < N); |
168 | children[i] = ref; |
169 | } |
170 | |
171 | struct Create2 |
172 | { |
173 | template<typename BuildRecord> |
174 | __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const |
175 | { |
176 | __aligned(64) AABBNode_t<NodeRef,N> node; |
177 | node.clear(); |
178 | for (size_t i=0; i<n; i++) { |
179 | node.setBounds(i,children[i].bounds()); |
180 | } |
181 | QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(bytes: sizeof(QuantizedNode_t), align: NodeRef::byteAlignment); |
182 | qnode->init(node); |
183 | |
184 | return (size_t)qnode | NodeRef::tyQuantizedNode; |
185 | } |
186 | }; |
187 | |
188 | struct Set2 |
189 | { |
190 | template<typename BuildRecord> |
191 | __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const |
192 | { |
193 | #if defined(DEBUG) |
194 | // check that empty children are only at the end of the child list |
195 | bool emptyChild = false; |
196 | for (size_t i=0; i<num; i++) { |
197 | emptyChild |= (children[i] == NodeRef::emptyNode); |
198 | assert(emptyChild == (children[i] == NodeRef::emptyNode)); |
199 | } |
200 | #endif |
201 | QuantizedNode_t* node = ref.quantizedNode(); |
202 | for (size_t i=0; i<num; i++) node->setRef(i,children[i]); |
203 | return ref; |
204 | } |
205 | }; |
206 | |
207 | __forceinline void init(AABBNode_t<NodeRef,N>& node) |
208 | { |
209 | for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode; |
210 | init_dim(node); |
211 | } |
212 | |
213 | }; |
214 | |
215 | /*! BVHN Quantized Node */ |
216 | template<int N> |
217 | struct __aligned(8) QuantizedBaseNodeMB_t |
218 | { |
219 | QuantizedBaseNode_t<N> node0; |
220 | QuantizedBaseNode_t<N> node1; |
221 | |
222 | /*! Clears the node. */ |
223 | __forceinline void clear() { |
224 | node0.clear(); |
225 | node1.clear(); |
226 | } |
227 | |
228 | /*! Returns bounds of specified child. */ |
229 | __forceinline BBox3fa bounds(size_t i) const |
230 | { |
231 | assert(i < N); |
232 | BBox3fa bounds0 = node0.bounds(i); |
233 | BBox3fa bounds1 = node1.bounds(i); |
234 | bounds0.extend(other: bounds1); |
235 | return bounds0; |
236 | } |
237 | |
238 | /*! Returns extent of bounds of specified child. */ |
239 | __forceinline Vec3fa extent(size_t i) const { |
240 | return bounds(i).size(); |
241 | } |
242 | |
243 | __forceinline vbool<N> validMask() const { return node0.validMask(); } |
244 | |
245 | template<typename T> |
246 | __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); } |
247 | template<typename T> |
248 | __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); } |
249 | template<typename T> |
250 | __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); } |
251 | template<typename T> |
252 | __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); } |
253 | template<typename T> |
254 | __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); } |
255 | template<typename T> |
256 | __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); } |
257 | |
258 | |
259 | template<int M> |
260 | __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); } |
261 | template<int M> |
262 | __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); } |
263 | template<int M> |
264 | __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); } |
265 | template<int M> |
266 | __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); } |
267 | template<int M> |
268 | __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); } |
269 | template<int M> |
270 | __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); } |
271 | |
272 | }; |
273 | } |
274 | |