| 1 | // Copyright 2009-2021 Intel Corporation | 
| 2 | // SPDX-License-Identifier: Apache-2.0 | 
| 3 |  | 
| 4 | #pragma once | 
| 5 |  | 
| 6 | #include "bvh_node_base.h" | 
| 7 |  | 
| 8 | namespace embree | 
| 9 | { | 
| 10 |   /*! BVHN Quantized Node */ | 
| 11 |   template<int N> | 
| 12 |     struct __aligned(8) QuantizedBaseNode_t | 
| 13 |   { | 
| 14 |     typedef unsigned char T; | 
| 15 |     static const T MIN_QUAN = 0; | 
| 16 |     static const T MAX_QUAN = 255; | 
| 17 |      | 
| 18 |     /*! Clears the node. */ | 
| 19 |     __forceinline void clear() { | 
| 20 |       for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN; | 
| 21 |       for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN; | 
| 22 |     } | 
| 23 |      | 
| 24 |     /*! Returns bounds of specified child. */ | 
| 25 |     __forceinline BBox3fa bounds(size_t i) const | 
| 26 |     { | 
| 27 |       assert(i < N); | 
| 28 |       const Vec3fa lower(madd(a: scale.x,b: (float)lower_x[i],c: start.x), | 
| 29 |                          madd(a: scale.y,b: (float)lower_y[i],c: start.y), | 
| 30 |                          madd(a: scale.z,b: (float)lower_z[i],c: start.z)); | 
| 31 |       const Vec3fa upper(madd(a: scale.x,b: (float)upper_x[i],c: start.x), | 
| 32 |                          madd(a: scale.y,b: (float)upper_y[i],c: start.y), | 
| 33 |                          madd(a: scale.z,b: (float)upper_z[i],c: start.z)); | 
| 34 |       return BBox3fa(lower,upper); | 
| 35 |     } | 
| 36 |      | 
| 37 |     /*! Returns extent of bounds of specified child. */ | 
| 38 |     __forceinline Vec3fa extent(size_t i) const { | 
| 39 |       return bounds(i).size(); | 
| 40 |     } | 
| 41 |      | 
| 42 |     static __forceinline void init_dim(const vfloat<N> &lower, | 
| 43 |                                        const vfloat<N> &upper, | 
| 44 |                                        T lower_quant[N], | 
| 45 |                                        T upper_quant[N], | 
| 46 |                                        float &start, | 
| 47 |                                        float &scale) | 
| 48 |     { | 
| 49 |       /* quantize bounds */ | 
| 50 |       const vbool<N> m_valid = lower != vfloat<N>(pos_inf); | 
| 51 |       const float minF = reduce_min(lower); | 
| 52 |       const float maxF = reduce_max(upper); | 
| 53 |       float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); | 
| 54 |       float decode_scale = diff / float(MAX_QUAN); | 
| 55 |       if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero | 
| 56 |       assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); | 
| 57 |       const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f; | 
| 58 |       vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN); | 
| 59 |       vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN); | 
| 60 |        | 
| 61 |       /* lower/upper correction */ | 
| 62 |       vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower; | 
| 63 |       vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper; | 
| 64 |       ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); | 
| 65 |       iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); | 
| 66 |        | 
| 67 |       /* disable invalid lanes */ | 
| 68 |       ilower = select(m_valid,ilower,MAX_QUAN); | 
| 69 |       iupper = select(m_valid,iupper,MIN_QUAN); | 
| 70 |        | 
| 71 |       /* store as uchar to memory */ | 
| 72 |       vint<N>::store(lower_quant,ilower); | 
| 73 |       vint<N>::store(upper_quant,iupper); | 
| 74 |       start = minF; | 
| 75 |       scale = decode_scale; | 
| 76 |        | 
| 77 | #if defined(DEBUG) | 
| 78 |       vfloat<N> extract_lower( vint<N>::loadu(lower_quant) ); | 
| 79 |       vfloat<N> extract_upper( vint<N>::loadu(upper_quant) ); | 
| 80 |       vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF); | 
| 81 |       vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF); | 
| 82 |       assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); | 
| 83 |       assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid)); | 
| 84 | #endif | 
| 85 |     } | 
| 86 |      | 
| 87 |     __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node) | 
| 88 |     { | 
| 89 |       init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x); | 
| 90 |       init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y); | 
| 91 |       init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z); | 
| 92 |     } | 
| 93 |      | 
| 94 |     __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); } | 
| 95 |      | 
| 96 | #if defined(__AVX512F__) // KNL | 
| 97 |     __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } | 
| 98 | #endif | 
| 99 |     __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); } | 
| 100 |      | 
| 101 |     __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); } | 
| 102 |      | 
| 103 |     __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); } | 
| 104 |      | 
| 105 |     __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); } | 
| 106 |      | 
| 107 |     __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); } | 
| 108 |      | 
| 109 |     __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); } | 
| 110 |      | 
| 111 |     template <int M> | 
| 112 |       __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); } | 
| 113 |      | 
| 114 | #if defined(__AVX512F__) | 
| 115 |     __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } | 
| 116 |     __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); } | 
| 117 |     __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); }       | 
| 118 | #endif | 
| 119 |      | 
| 120 |     union { | 
| 121 |       struct { | 
| 122 |         T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children | 
| 123 |         T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children | 
| 124 |         T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children | 
| 125 |         T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children | 
| 126 |         T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children | 
| 127 |         T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children | 
| 128 |       }; | 
| 129 |       T all_planes[6*N]; | 
| 130 |     }; | 
| 131 |      | 
| 132 |     Vec3f start; | 
| 133 |     Vec3f scale; | 
| 134 |      | 
| 135 |     friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) | 
| 136 |     { | 
| 137 |       o << "QuantizedBaseNode { "  << embree_endl; | 
| 138 |       o << "  start   "  << n.start << embree_endl; | 
| 139 |       o << "  scale   "  << n.scale << embree_endl; | 
| 140 |       o << "  lower_x "  << vuint<N>::loadu(n.lower_x) << embree_endl; | 
| 141 |       o << "  upper_x "  << vuint<N>::loadu(n.upper_x) << embree_endl; | 
| 142 |       o << "  lower_y "  << vuint<N>::loadu(n.lower_y) << embree_endl; | 
| 143 |       o << "  upper_y "  << vuint<N>::loadu(n.upper_y) << embree_endl; | 
| 144 |       o << "  lower_z "  << vuint<N>::loadu(n.lower_z) << embree_endl; | 
| 145 |       o << "  upper_z "  << vuint<N>::loadu(n.upper_z) << embree_endl; | 
| 146 |       o << "}"  << embree_endl; | 
| 147 |       return o; | 
| 148 |     } | 
| 149 |      | 
| 150 |   }; | 
| 151 |  | 
| 152 |   template<typename NodeRef, int N> | 
| 153 |     struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N> | 
| 154 |   { | 
| 155 |     using BaseNode_t<NodeRef,N>::children; | 
| 156 |     using QuantizedBaseNode_t<N>::lower_x; | 
| 157 |     using QuantizedBaseNode_t<N>::upper_x; | 
| 158 |     using QuantizedBaseNode_t<N>::lower_y; | 
| 159 |     using QuantizedBaseNode_t<N>::upper_y; | 
| 160 |     using QuantizedBaseNode_t<N>::lower_z; | 
| 161 |     using QuantizedBaseNode_t<N>::upper_z; | 
| 162 |     using QuantizedBaseNode_t<N>::start; | 
| 163 |     using QuantizedBaseNode_t<N>::scale; | 
| 164 |     using QuantizedBaseNode_t<N>::init_dim; | 
| 165 |      | 
| 166 |     __forceinline void setRef(size_t i, const NodeRef& ref) { | 
| 167 |       assert(i < N); | 
| 168 |       children[i] = ref; | 
| 169 |     } | 
| 170 |      | 
| 171 |     struct Create2 | 
| 172 |     { | 
| 173 |       template<typename BuildRecord> | 
| 174 |       __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const | 
| 175 |       { | 
| 176 |         __aligned(64) AABBNode_t<NodeRef,N> node; | 
| 177 |         node.clear(); | 
| 178 |         for (size_t i=0; i<n; i++) { | 
| 179 |           node.setBounds(i,children[i].bounds()); | 
| 180 |         } | 
| 181 |         QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(bytes: sizeof(QuantizedNode_t), align: NodeRef::byteAlignment); | 
| 182 |         qnode->init(node); | 
| 183 |          | 
| 184 |         return (size_t)qnode | NodeRef::tyQuantizedNode; | 
| 185 |       } | 
| 186 |     }; | 
| 187 |      | 
| 188 |     struct Set2 | 
| 189 |     { | 
| 190 |       template<typename BuildRecord> | 
| 191 |       __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const | 
| 192 |       { | 
| 193 | #if defined(DEBUG) | 
| 194 |         // check that empty children are only at the end of the child list | 
| 195 |         bool emptyChild = false; | 
| 196 |         for (size_t i=0; i<num; i++) { | 
| 197 |           emptyChild |= (children[i] == NodeRef::emptyNode); | 
| 198 |           assert(emptyChild == (children[i] == NodeRef::emptyNode)); | 
| 199 |         } | 
| 200 | #endif | 
| 201 |         QuantizedNode_t* node = ref.quantizedNode(); | 
| 202 |         for (size_t i=0; i<num; i++) node->setRef(i,children[i]); | 
| 203 |         return ref; | 
| 204 |       } | 
| 205 |     }; | 
| 206 |      | 
| 207 |     __forceinline void init(AABBNode_t<NodeRef,N>& node) | 
| 208 |     { | 
| 209 |       for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode; | 
| 210 |       init_dim(node); | 
| 211 |     } | 
| 212 |      | 
| 213 |   };  | 
| 214 |    | 
| 215 |   /*! BVHN Quantized Node */ | 
| 216 |   template<int N> | 
| 217 |     struct __aligned(8) QuantizedBaseNodeMB_t | 
| 218 |   { | 
| 219 |     QuantizedBaseNode_t<N> node0; | 
| 220 |     QuantizedBaseNode_t<N> node1; | 
| 221 |      | 
| 222 |     /*! Clears the node. */ | 
| 223 |     __forceinline void clear() { | 
| 224 |       node0.clear(); | 
| 225 |       node1.clear(); | 
| 226 |     } | 
| 227 |      | 
| 228 |     /*! Returns bounds of specified child. */ | 
| 229 |     __forceinline BBox3fa bounds(size_t i) const | 
| 230 |     { | 
| 231 |       assert(i < N); | 
| 232 |       BBox3fa bounds0 = node0.bounds(i); | 
| 233 |       BBox3fa bounds1 = node1.bounds(i); | 
| 234 |       bounds0.extend(other: bounds1); | 
| 235 |       return bounds0; | 
| 236 |     } | 
| 237 |      | 
| 238 |     /*! Returns extent of bounds of specified child. */ | 
| 239 |     __forceinline Vec3fa extent(size_t i) const { | 
| 240 |       return bounds(i).size(); | 
| 241 |     } | 
| 242 |      | 
| 243 |     __forceinline vbool<N> validMask() const { return node0.validMask(); } | 
| 244 |      | 
| 245 |     template<typename T> | 
| 246 |       __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); } | 
| 247 |     template<typename T> | 
| 248 |       __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); } | 
| 249 |     template<typename T> | 
| 250 |       __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); } | 
| 251 |     template<typename T> | 
| 252 |       __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); } | 
| 253 |     template<typename T> | 
| 254 |       __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); } | 
| 255 |     template<typename T> | 
| 256 |       __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); } | 
| 257 |      | 
| 258 |      | 
| 259 |     template<int M> | 
| 260 |       __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); } | 
| 261 |     template<int M> | 
| 262 |       __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); } | 
| 263 |     template<int M> | 
| 264 |       __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); } | 
| 265 |     template<int M> | 
| 266 |       __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); } | 
| 267 |     template<int M> | 
| 268 |       __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); } | 
| 269 |     template<int M> | 
| 270 |       __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); } | 
| 271 |      | 
| 272 |   }; | 
| 273 | } | 
| 274 |  |