| 1 | // Copyright 2009-2021 Intel Corporation | 
| 2 | // SPDX-License-Identifier: Apache-2.0 | 
| 3 |  | 
| 4 | #pragma once | 
| 5 |  | 
| 6 | #include "../simd/simd.h" | 
| 7 | #include "parallel_for.h" | 
| 8 | #include <algorithm> | 
| 9 |  | 
| 10 | namespace embree | 
| 11 | { | 
| 12 |   template<class T> | 
| 13 |     __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length) | 
| 14 |   { | 
| 15 |     for(size_t i = 1;i<length;++i) | 
| 16 |     { | 
| 17 |       T v = array[i]; | 
| 18 |       size_t j = i; | 
| 19 |       while(j > 0 && v < array[j-1]) | 
| 20 |       { | 
| 21 |         array[j] = array[j-1]; | 
| 22 |         --j; | 
| 23 |       } | 
| 24 |       array[j] = v; | 
| 25 |     } | 
| 26 |   } | 
| 27 |    | 
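  /* in-place insertion sort in descending order */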
| 28 |   template<class T> | 
    __forceinline void insertionsort_descending(T *__restrict__ array, const size_t length)
| 30 |   { | 
| 31 |     for(size_t i = 1;i<length;++i) | 
| 32 |     { | 
| 33 |       T v = array[i]; | 
| 34 |       size_t j = i; | 
| 35 |       while(j > 0 && v > array[j-1]) | 
| 36 |       { | 
| 37 |         array[j] = array[j-1]; | 
| 38 |         --j; | 
| 39 |       } | 
| 40 |       array[j] = v; | 
| 41 |     } | 
| 42 |   } | 
| 43 |    | 
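  /* recursive quicksort in ascending order over the inclusive range
     t[begin..end], using Hoare partitioning with t[begin] as pivot */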
| 44 |   template<class T>  | 
| 45 |     void quicksort_ascending(T *__restrict__ t,  | 
| 46 | 			     const ssize_t begin,  | 
| 47 | 			     const ssize_t end) | 
| 48 |   { | 
| 49 |     if (likely(begin < end))  | 
| 50 |     {       | 
| 51 |       const T pivotvalue = t[begin]; | 
| 52 |       ssize_t left  = begin - 1; | 
| 53 |       ssize_t right = end   + 1; | 
| 54 |        | 
| 55 |       while(1)  | 
| 56 |       { | 
| 57 |         while (t[--right] > pivotvalue); | 
| 58 |         while (t[++left] < pivotvalue); | 
| 59 |          | 
| 60 |         if (left >= right) break; | 
| 61 |          | 
| 62 |         const T temp = t[right]; | 
| 63 |         t[right] = t[left]; | 
| 64 |         t[left] = temp; | 
| 65 |       } | 
| 66 |        | 
      const ssize_t pivot = right;
| 68 |       quicksort_ascending(t, begin, pivot); | 
| 69 |       quicksort_ascending(t, pivot + 1, end); | 
| 70 |     } | 
| 71 |   } | 
| 72 |    | 
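  /* recursive quicksort in descending order; mirror image of quicksort_ascending */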
| 73 |   template<class T>  | 
    void quicksort_descending(T *__restrict__ t, 
| 75 | 			     const ssize_t begin,  | 
| 76 | 			     const ssize_t end) | 
| 77 |   { | 
| 78 |     if (likely(begin < end))  | 
| 79 |     { | 
| 80 |       const T pivotvalue = t[begin]; | 
| 81 |       ssize_t left  = begin - 1; | 
| 82 |       ssize_t right = end   + 1; | 
| 83 |        | 
| 84 |       while(1)  | 
| 85 |       { | 
| 86 |         while (t[--right] < pivotvalue); | 
| 87 |         while (t[++left] > pivotvalue); | 
| 88 |          | 
| 89 |         if (left >= right) break; | 
| 90 |          | 
| 91 |         const T temp = t[right]; | 
| 92 |         t[right] = t[left]; | 
| 93 |         t[left] = temp; | 
| 94 |       } | 
| 95 |        | 
      const ssize_t pivot = right;
      quicksort_descending(t, begin, pivot);
      quicksort_descending(t, pivot + 1, end);
| 99 |     } | 
| 100 |   } | 
| 101 |    | 
| 102 |    | 
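  /* hybrid quicksort that falls back to insertion sort once a subrange
     contains at most THRESHOLD elements, avoiding recursion on tiny ranges */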
| 103 |   template<class T, ssize_t THRESHOLD>  | 
| 104 |     void quicksort_insertionsort_ascending(T *__restrict__ t,  | 
| 105 | 					   const ssize_t begin,  | 
| 106 | 					   const ssize_t end) | 
| 107 |   { | 
| 108 |     if (likely(begin < end))  | 
| 109 |     {       | 
| 110 |       const ssize_t size = end-begin+1; | 
| 111 |       if (likely(size <= THRESHOLD)) | 
| 112 |       { | 
| 113 |         insertionsort_ascending<T>(&t[begin],size); | 
| 114 |       } | 
| 115 |       else | 
| 116 |       { | 
| 117 |         const T pivotvalue = t[begin]; | 
| 118 |         ssize_t left  = begin - 1; | 
| 119 |         ssize_t right = end   + 1; | 
| 120 |          | 
| 121 |         while(1)  | 
| 122 |         { | 
| 123 |           while (t[--right] > pivotvalue); | 
| 124 |           while (t[++left] < pivotvalue); | 
| 125 |            | 
| 126 |           if (left >= right) break; | 
| 127 |            | 
| 128 |           const T temp = t[right]; | 
| 129 |           t[right] = t[left]; | 
| 130 |           t[left] = temp; | 
| 131 |         } | 
| 132 |          | 
| 133 |         const ssize_t pivot = right; | 
| 134 |         quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot); | 
| 135 |         quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end); | 
| 136 |       } | 
| 137 |     } | 
| 138 |   } | 
| 139 |    | 
| 140 |    | 
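  /* descending variant of the hybrid quicksort/insertion sort above */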
| 141 |   template<class T, ssize_t THRESHOLD>  | 
    void quicksort_insertionsort_descending(T *__restrict__ t, 
| 143 | 					   const ssize_t begin,  | 
| 144 | 					   const ssize_t end) | 
| 145 |   { | 
| 146 |     if (likely(begin < end))  | 
| 147 |     { | 
| 148 |       const ssize_t size = end-begin+1; | 
| 149 |       if (likely(size <= THRESHOLD)) | 
| 150 |       { | 
        insertionsort_descending<T>(&t[begin],size);
| 152 |       } | 
| 153 |       else | 
| 154 |       { | 
| 155 |          | 
| 156 |         const T pivotvalue = t[begin]; | 
| 157 |         ssize_t left  = begin - 1; | 
| 158 |         ssize_t right = end   + 1; | 
| 159 |          | 
| 160 |         while(1)  | 
| 161 |         { | 
| 162 |           while (t[--right] < pivotvalue); | 
| 163 |           while (t[++left] > pivotvalue); | 
| 164 |            | 
| 165 |           if (left >= right) break; | 
| 166 |            | 
| 167 |           const T temp = t[right]; | 
| 168 |           t[right] = t[left]; | 
| 169 |           t[left] = temp; | 
| 170 |         } | 
| 171 |          | 
| 172 |         const ssize_t pivot = right; | 
        quicksort_insertionsort_descending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_descending<T,THRESHOLD>(t, pivot + 1, end);
| 175 |       } | 
| 176 |     } | 
| 177 |   } | 
| 178 |    | 
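  /* in-place MSD radix sort: permutes the elements into 256 buckets by the
     byte selected through 'shift' (most significant byte first), then recurses
     into each bucket with the next lower byte; small buckets are finished with
     insertion sort; T must be convertible to unsigned int to expose the key */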
| 179 |   template<typename T> | 
| 180 |     static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8) | 
| 181 |   { | 
| 182 |     static const unsigned int BITS = 8; | 
| 183 |     static const unsigned int BUCKETS = (1 << BITS); | 
| 184 |     static const unsigned int CMP_SORT_THRESHOLD = 16; | 
| 185 |      | 
| 186 |     __aligned(64) unsigned int count[BUCKETS]; | 
| 187 |      | 
| 188 |     /* clear buckets */ | 
| 189 |     for (size_t i=0;i<BUCKETS;i++) count[i] = 0; | 
| 190 |      | 
| 191 |     /* count buckets */ | 
| 192 | #if defined(__INTEL_COMPILER) | 
| 193 | #pragma nounroll | 
| 194 | #endif | 
| 195 |     for (size_t i=0;i<num;i++) | 
| 196 |       count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++; | 
| 197 |      | 
| 198 |     /* prefix sums */ | 
| 199 |     __aligned(64) unsigned int head[BUCKETS]; | 
| 200 |     __aligned(64) unsigned int tail[BUCKETS]; | 
| 201 |      | 
| 202 |     head[0] = 0; | 
| 203 |     for (size_t i=1; i<BUCKETS; i++)     | 
| 204 |       head[i] = head[i-1] + count[i-1]; | 
| 205 |      | 
| 206 |     for (size_t i=0; i<BUCKETS-1; i++)     | 
| 207 |       tail[i] = head[i+1]; | 
| 208 |      | 
| 209 |     tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1]; | 
| 210 |      | 
    assert(tail[BUCKETS-1] == num);      
| 213 |      | 
| 214 |     /* in-place swap */       | 
| 215 |     for (size_t i=0;i<BUCKETS;i++) | 
| 216 |     { | 
| 217 |       /* process bucket */ | 
| 218 |       while(head[i] < tail[i]) | 
| 219 |       { | 
| 220 |         T v = morton[head[i]]; | 
| 221 |         while(1) | 
| 222 |         { | 
| 223 |           const size_t b = (unsigned(v) >> shift) & (BUCKETS-1); | 
| 224 |           if (b == i) break; | 
| 225 |           std::swap(v,morton[head[b]++]); | 
| 226 |         } | 
        assert(((unsigned(v) >> shift) & (BUCKETS-1)) == i);
| 228 |         morton[head[i]++] = v; | 
| 229 |       } | 
| 230 |     } | 
| 231 |     if (shift == 0) return; | 
| 232 |      | 
| 233 |     size_t offset = 0; | 
| 234 |     for (size_t i=0;i<BUCKETS;i++) | 
| 235 |       if (count[i]) | 
| 236 |       { | 
| 237 |          | 
| 238 |         for (size_t j=offset;j<offset+count[i]-1;j++) | 
| 239 |           assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i); | 
| 240 |          | 
| 241 |         if (unlikely(count[i] < CMP_SORT_THRESHOLD)) | 
| 242 |           insertionsort_ascending(morton + offset, count[i]); | 
| 243 |         else | 
| 244 |           radixsort32(morton + offset, count[i], shift-BITS); | 
| 245 |          | 
| 246 |         for (size_t j=offset;j<offset+count[i]-1;j++) | 
| 247 |           assert(morton[j] <= morton[j+1]); | 
| 248 |          | 
| 249 |         offset += count[i]; | 
| 250 |       }       | 
| 251 |   }     | 
| 252 |  | 
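  /* parallel LSD radix sort: each pass distributes the elements by one 8-bit
     digit of Key, ping-ponging between 'src' and the temporary buffer 'tmp';
     per-task bucket histograms make the scatter step parallel and stable */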
| 253 |   template<typename Ty, typename Key> | 
| 254 |     class ParallelRadixSort | 
| 255 |   { | 
| 256 |     static const size_t MAX_TASKS = 64; | 
| 257 |     static const size_t BITS = 8; | 
| 258 |     static const size_t BUCKETS = (1 << BITS); | 
| 259 |     typedef unsigned int TyRadixCount[BUCKETS]; | 
| 260 |      | 
| 261 |     template<typename T> | 
| 262 |       static bool compare(const T& v0, const T& v1) { | 
| 263 |       return (Key)v0 < (Key)v1; | 
| 264 |     } | 
| 265 |  | 
| 266 |   private: | 
| 267 |     ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement | 
| 268 |     ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement | 
| 269 |  | 
| 270 |      | 
| 271 |   public: | 
| 272 |     ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N) | 
| 273 |       : radixCount(nullptr), src(src), tmp(tmp), N(N) {} | 
| 274 |  | 
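    /* sorts the N elements; inputs of at most blockSize elements are sorted
       single threaded with std::sort, larger inputs use the parallel radix sort */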
| 275 |     void sort(const size_t blockSize) | 
| 276 |     { | 
| 277 |       assert(blockSize > 0); | 
| 278 |        | 
| 279 |       /* perform single threaded sort for small N */ | 
      if (N<=blockSize) // also handles the special case N == 0
      {
| 282 |         /* do inplace sort inside destination array */ | 
| 283 |         std::sort(src,src+N,compare<Ty>); | 
| 284 |       } | 
| 285 |        | 
| 286 |       /* perform parallel sort for large N */ | 
| 287 |       else  | 
| 288 |       { | 
        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
        tbbRadixSort(numThreads);
| 291 |       } | 
| 292 |     } | 
| 293 |  | 
| 294 |     ~ParallelRadixSort() | 
| 295 |     { | 
      alignedFree(radixCount); 
| 297 |       radixCount = nullptr; | 
| 298 |     } | 
| 299 |      | 
| 300 |   private: | 
| 301 |      | 
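    /* pass 1 of a radix iteration: each task computes, for its slice of 'src',
       a histogram of how many elements fall into each of the 256 buckets of
       the digit selected by 'shift' */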
| 302 |     void tbbRadixIteration0(const Key shift,  | 
| 303 |                             const Ty* __restrict const src,  | 
| 304 |                             Ty* __restrict const dst,  | 
| 305 |                             const size_t threadIndex, const size_t threadCount) | 
| 306 |     { | 
| 307 |       const size_t startID = (threadIndex+0)*N/threadCount; | 
| 308 |       const size_t endID   = (threadIndex+1)*N/threadCount; | 
| 309 |        | 
| 310 |       /* mask to extract some number of bits */ | 
| 311 |       const Key mask = BUCKETS-1; | 
| 312 |        | 
| 313 |       /* count how many items go into the buckets */ | 
| 314 |       for (size_t i=0; i<BUCKETS; i++) | 
| 315 |         radixCount[threadIndex][i] = 0; | 
| 316 |  | 
| 317 |       /* iterate over src array and count buckets */ | 
| 318 |       unsigned int * __restrict const count = radixCount[threadIndex]; | 
| 319 | #if defined(__INTEL_COMPILER) | 
| 320 | #pragma nounroll       | 
| 321 | #endif | 
| 322 |       for (size_t i=startID; i<endID; i++) { | 
| 323 | #if defined(__64BIT__) | 
| 324 |         const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask; | 
| 325 | #else | 
| 326 |         const Key index = ((Key)src[i] >> shift) & mask; | 
| 327 | #endif | 
| 328 |         count[index]++; | 
| 329 |       } | 
| 330 |     } | 
| 331 |      | 
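    /* pass 2 of a radix iteration: derives each task's start offset per bucket
       from the histograms of pass 1, then scatters the task's slice of 'src'
       into 'dst' */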
| 332 |     void tbbRadixIteration1(const Key shift,  | 
| 333 |                             const Ty* __restrict const src,  | 
| 334 |                             Ty* __restrict const dst,  | 
| 335 |                             const size_t threadIndex, const size_t threadCount) | 
| 336 |     { | 
| 337 |       const size_t startID = (threadIndex+0)*N/threadCount; | 
| 338 |       const size_t endID   = (threadIndex+1)*N/threadCount; | 
| 339 |        | 
| 340 |       /* mask to extract some number of bits */ | 
| 341 |       const Key mask = BUCKETS-1; | 
| 342 |        | 
| 343 |       /* calculate total number of items for each bucket */ | 
| 344 |       __aligned(64) unsigned int total[BUCKETS]; | 
| 345 |       /* | 
| 346 |       for (size_t i=0; i<BUCKETS; i++) | 
| 347 |         total[i] = 0; | 
| 348 |       */ | 
| 349 |       for (size_t i=0; i<BUCKETS; i+=VSIZEX) | 
| 350 |         vintx::store(ptr: &total[i], v: zero); | 
| 351 |        | 
| 352 |       for (size_t i=0; i<threadCount; i++) | 
| 353 |       { | 
| 354 |         /* | 
| 355 |         for (size_t j=0; j<BUCKETS; j++) | 
| 356 |           total[j] += radixCount[i][j]; | 
| 357 |         */ | 
| 358 |         for (size_t j=0; j<BUCKETS; j+=VSIZEX) | 
| 359 |           vintx::store(ptr: &total[j], v: vintx::load(a: &total[j]) + vintx::load(a: &radixCount[i][j])); | 
| 360 |       } | 
| 361 |        | 
| 362 |       /* calculate start offset of each bucket */ | 
| 363 |       __aligned(64) unsigned int offset[BUCKETS]; | 
| 364 |       offset[0] = 0; | 
| 365 |       for (size_t i=1; i<BUCKETS; i++)     | 
| 366 |         offset[i] = offset[i-1] + total[i-1]; | 
| 367 |        | 
| 368 |       /* calculate start offset of each bucket for this thread */ | 
| 369 |       for (size_t i=0; i<threadIndex; i++) | 
| 370 |       { | 
| 371 |         /* | 
| 372 |         for (size_t j=0; j<BUCKETS; j++) | 
| 373 |           offset[j] += radixCount[i][j]; | 
| 374 |         */ | 
| 375 |         for (size_t j=0; j<BUCKETS; j+=VSIZEX) | 
| 376 |           vintx::store(ptr: &offset[j], v: vintx::load(a: &offset[j]) + vintx::load(a: &radixCount[i][j])); | 
| 377 |       } | 
| 378 |        | 
| 379 |       /* copy items into their buckets */ | 
| 380 | #if defined(__INTEL_COMPILER) | 
| 381 | #pragma nounroll | 
| 382 | #endif | 
| 383 |       for (size_t i=startID; i<endID; i++) { | 
        const Ty elt = src[i];
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)elt >> (size_t)shift) & (size_t)mask;
#else
        const size_t index = ((Key)elt >> shift) & mask;
#endif
| 390 |         dst[offset[index]++] = elt; | 
| 391 |       } | 
| 392 |     } | 
| 393 |      | 
| 394 |     void tbbRadixIteration(const Key shift, const bool last, | 
| 395 |                            const Ty* __restrict src, Ty* __restrict dst, | 
| 396 |                            const size_t numTasks) | 
| 397 |     { | 
| 398 |       affinity_partitioner ap; | 
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
| 401 |     } | 
| 402 |      | 
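    /* runs one iteration per 8-bit digit: 4 passes for 32-bit keys, 8 passes
       for 64-bit keys; the pass count is even, so the sorted result always
       ends up back in 'src' */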
| 403 |     void tbbRadixSort(const size_t numTasks) | 
| 404 |     { | 
      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
| 406 |        | 
| 407 |       if (sizeof(Key) == sizeof(uint32_t)) { | 
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
| 412 |       } | 
| 413 |       else if (sizeof(Key) == sizeof(uint64_t)) | 
| 414 |       { | 
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
| 423 |       } | 
| 424 |     } | 
| 425 |      | 
| 426 |   private: | 
| 427 |     TyRadixCount* radixCount; | 
| 428 |     Ty* const src; | 
| 429 |     Ty* const tmp; | 
| 430 |     const size_t N; | 
| 431 |   }; | 
| 432 |  | 
| 433 |   template<typename Ty> | 
| 434 |     void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) | 
| 435 |   { | 
| 436 |     ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize); | 
| 437 |   } | 
| 438 |    | 
| 439 |   template<typename Ty, typename Key> | 
| 440 |     void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) | 
| 441 |   { | 
| 442 |     ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize); | 
| 443 |   } | 
| 444 |    | 
| 445 |   template<typename Ty> | 
| 446 |     void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { | 
| 447 |     radix_sort<Ty,uint32_t>(src,tmp,N,blockSize); | 
| 448 |   } | 
| 449 |    | 
| 450 |   template<typename Ty> | 
| 451 |     void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) { | 
| 452 |     radix_sort<Ty,uint64_t>(src,tmp,N,blockSize); | 
| 453 |   } | 
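
  /* example usage (illustrative sketch; the vector names are hypothetical,
     and the caller must provide a scratch buffer of the same size):

       std::vector<uint32_t> keys = ...;        // data to sort
       std::vector<uint32_t> tmp(keys.size());  // scratch buffer
       embree::radix_sort_u32(keys.data(), tmp.data(), keys.size());
       // 'keys' is now sorted ascending; 'tmp' holds scratch data
  */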
| 454 | } | 
| 455 |  |