| 1 | // Copyright 2009-2021 Intel Corporation | 
| 2 | // SPDX-License-Identifier: Apache-2.0 | 
| 3 |  | 
| 4 | #pragma once | 
| 5 |  | 
| 6 | #include "parallel_for.h" | 
| 7 | #include "../math/range.h" | 
| 8 |  | 
| 9 | namespace embree | 
| 10 | { | 
| 11 |   /* serial partitioning */ | 
| 12 |   template<typename T, typename V, typename IsLeft, typename Reduction_T> | 
| 13 |     __forceinline size_t serial_partitioning(T* array,  | 
| 14 |                                              const size_t begin, | 
| 15 |                                              const size_t end,  | 
| 16 |                                              V& leftReduction, | 
| 17 |                                              V& rightReduction, | 
| 18 |                                              const IsLeft& is_left,  | 
| 19 |                                              const Reduction_T& reduction_t) | 
| 20 |   { | 
| 21 |     T* l = array + begin; | 
| 22 |     T* r = array + end - 1; | 
| 23 |      | 
| 24 |     while(1) | 
| 25 |     { | 
| 26 |       /* *l < pivot */ | 
| 27 |       while (likely(l <= r && is_left(*l) ))  | 
| 28 |       { | 
| 29 |         //prefetchw(l+4); // FIXME: enable? | 
| 30 |         reduction_t(leftReduction,*l); | 
| 31 |         ++l; | 
| 32 |       } | 
| 33 |       /* *r >= pivot) */ | 
| 34 |       while (likely(l <= r && !is_left(*r))) | 
| 35 |       { | 
| 36 |         //prefetchw(r-4); FIXME: enable? | 
| 37 |         reduction_t(rightReduction,*r); | 
| 38 |         --r; | 
| 39 |       } | 
| 40 |       if (r<l) break; | 
| 41 |        | 
| 42 |       reduction_t(leftReduction ,*r); | 
| 43 |       reduction_t(rightReduction,*l); | 
| 44 |       xchg(*l,*r); | 
| 45 |       l++; r--; | 
| 46 |     } | 
| 47 |      | 
| 48 |     return l - array;         | 
| 49 |   } | 
| 50 |  | 
| 51 |   template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> | 
| 52 |     class __aligned(64) parallel_partition_task | 
| 53 |   { | 
| 54 |     ALIGNED_CLASS_(64); | 
| 55 |   private: | 
| 56 |  | 
| 57 |     static const size_t MAX_TASKS = 64; | 
| 58 |  | 
| 59 |     T* array; | 
| 60 |     size_t N; | 
| 61 |     const IsLeft& is_left; | 
| 62 |     const Reduction_T& reduction_t; | 
| 63 |     const Reduction_V& reduction_v; | 
| 64 |     const Vi& identity; | 
| 65 |  | 
| 66 |     size_t numTasks;  | 
| 67 |     __aligned(64) size_t counter_start[MAX_TASKS+1];  | 
| 68 |     __aligned(64) size_t counter_left[MAX_TASKS+1];   | 
| 69 |     __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];   | 
| 70 |     __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];  | 
| 71 |     __aligned(64) V leftReductions[MAX_TASKS];            | 
| 72 |     __aligned(64) V rightReductions[MAX_TASKS];     | 
| 73 |  | 
| 74 |   public: | 
| 75 |       | 
| 76 |     __forceinline parallel_partition_task(T* array,  | 
| 77 |                                           const size_t N,  | 
| 78 |                                           const Vi& identity,  | 
| 79 |                                           const IsLeft& is_left,  | 
| 80 |                                           const Reduction_T& reduction_t,  | 
| 81 |                                           const Reduction_V& reduction_v, | 
| 82 |                                           const size_t BLOCK_SIZE)  | 
| 83 |  | 
| 84 |       : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), | 
| 85 |       numTasks(min(a: (N+BLOCK_SIZE-1)/BLOCK_SIZE,b: min(a: TaskScheduler::threadCount(),b: MAX_TASKS))) {} | 
| 86 |  | 
| 87 |     __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges) | 
| 88 |     { | 
| 89 |       size_t i = 0; | 
| 90 |       while(index >= (size_t)r[i].size()) | 
| 91 |       { | 
| 92 |         assert(i < numRanges); | 
| 93 |         index -= (size_t)r[i].size(); | 
| 94 |         i++; | 
| 95 |       }	     | 
| 96 |       return &r[i]; | 
| 97 |     } | 
| 98 |  | 
| 99 |     __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, | 
| 100 |                                                   const size_t numRightMisplacedRanges, | 
| 101 |                                                   const size_t startID, | 
| 102 |                                                   const size_t endID) | 
| 103 |     { | 
| 104 |       size_t leftLocalIndex  = startID; | 
| 105 |       size_t rightLocalIndex = startID; | 
| 106 |       const range<ssize_t>* l_range = findStartRange(index&: leftLocalIndex,r: leftMisplacedRanges,numRanges: numLeftMisplacedRanges); | 
| 107 |       const range<ssize_t>* r_range = findStartRange(index&: rightLocalIndex,r: rightMisplacedRanges,numRanges: numRightMisplacedRanges); | 
| 108 |        | 
| 109 |       size_t l_left = l_range->size() - leftLocalIndex; | 
| 110 |       size_t r_left = r_range->size() - rightLocalIndex; | 
| 111 |       T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; | 
| 112 |       T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; | 
| 113 |       size_t size = endID - startID; | 
| 114 |       size_t items = min(a: size,b: min(a: l_left,b: r_left));  | 
| 115 |       | 
| 116 |       while (size) | 
| 117 |       { | 
| 118 |         if (unlikely(l_left == 0)) | 
| 119 |         { | 
| 120 |           l_range++; | 
| 121 |           l_left = l_range->size(); | 
| 122 |           l = &array[l_range->begin()]; | 
| 123 |           items = min(a: size,b: min(a: l_left,b: r_left)); | 
| 124 |         } | 
| 125 |  | 
| 126 |         if (unlikely(r_left == 0)) | 
| 127 |         {		 | 
| 128 |           r_range++; | 
| 129 |           r_left = r_range->size(); | 
| 130 |           r = &array[r_range->begin()];           | 
| 131 |           items = min(a: size,b: min(a: l_left,b: r_left)); | 
| 132 |         } | 
| 133 |  | 
| 134 |         size   -= items; | 
| 135 |         l_left -= items; | 
| 136 |         r_left -= items; | 
| 137 |  | 
| 138 |         while(items) { | 
| 139 |           items--; | 
| 140 |           xchg(*l++,*r++); | 
| 141 |         } | 
| 142 |       } | 
| 143 |     } | 
| 144 |  | 
| 145 |     __forceinline size_t partition(V& leftReduction, V& rightReduction) | 
| 146 |     { | 
| 147 |       /* partition the individual ranges for each task */ | 
| 148 |       parallel_for(numTasks,[&] (const size_t taskID) { | 
| 149 |           const size_t startID = (taskID+0)*N/numTasks; | 
| 150 |           const size_t endID   = (taskID+1)*N/numTasks; | 
| 151 |           V local_left(identity); | 
| 152 |           V local_right(identity); | 
| 153 |           const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); | 
| 154 |           counter_start[taskID] = startID; | 
| 155 |           counter_left [taskID] = mid-startID; | 
| 156 |           leftReductions[taskID]  = local_left; | 
| 157 |           rightReductions[taskID] = local_right; | 
| 158 |         }); | 
| 159 |       counter_start[numTasks] = N; | 
| 160 |       counter_left[numTasks]  = 0; | 
| 161 |        | 
| 162 |       /* finalize the reductions */ | 
| 163 |       for (size_t i=0; i<numTasks; i++) { | 
| 164 |         reduction_v(leftReduction,leftReductions[i]); | 
| 165 |         reduction_v(rightReduction,rightReductions[i]); | 
| 166 |       } | 
| 167 |  | 
| 168 |       /* calculate mid point for partitioning */ | 
| 169 |       size_t mid = counter_left[0]; | 
| 170 |       for (size_t i=1; i<numTasks; i++) | 
| 171 |         mid += counter_left[i]; | 
| 172 |       const range<ssize_t> globalLeft (0,mid); | 
| 173 |       const range<ssize_t> globalRight(mid,N); | 
| 174 |  | 
| 175 |       /* calculate all left and right ranges that are on the wrong global side */ | 
| 176 |       size_t numMisplacedRangesLeft  = 0; | 
| 177 |       size_t numMisplacedRangesRight = 0; | 
| 178 |       size_t numMisplacedItemsLeft   = 0; | 
| 179 |       size_t numMisplacedItemsRight  = 0; | 
| 180 |  | 
| 181 |       for (size_t i=0; i<numTasks; i++) | 
| 182 |       {	     | 
| 183 |         const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]); | 
| 184 |         const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]); | 
| 185 |         const range<ssize_t> left_misplaced  = globalLeft. intersect(r: right_range); | 
| 186 |         const range<ssize_t> right_misplaced = globalRight.intersect(r: left_range); | 
| 187 |  | 
| 188 |         if (!left_misplaced.empty())   | 
| 189 |         { | 
| 190 |           numMisplacedItemsLeft += left_misplaced.size(); | 
| 191 |           leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; | 
| 192 |         } | 
| 193 |  | 
| 194 |         if (!right_misplaced.empty())  | 
| 195 |         { | 
| 196 |           numMisplacedItemsRight += right_misplaced.size(); | 
| 197 |           rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; | 
| 198 |         } | 
| 199 |       } | 
| 200 |       assert( numMisplacedItemsLeft == numMisplacedItemsRight ); | 
| 201 |  | 
| 202 |       /* if no items are misplaced we are done */ | 
| 203 |       if (numMisplacedItemsLeft == 0) | 
| 204 |         return mid; | 
| 205 |  | 
| 206 |       /* otherwise we copy the items to the right place in parallel */ | 
| 207 |       parallel_for(numTasks,[&] (const size_t taskID) { | 
| 208 |           const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; | 
| 209 |           const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks; | 
| 210 |           swapItemsInMisplacedRanges(numLeftMisplacedRanges: numMisplacedRangesLeft,numRightMisplacedRanges: numMisplacedRangesRight,startID,endID);	                              | 
| 211 |         }); | 
| 212 |  | 
| 213 |       return mid; | 
| 214 |     } | 
| 215 |   }; | 
| 216 |  | 
| 217 |   template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> | 
| 218 |     __noinline size_t parallel_partitioning(T* array,  | 
| 219 |                                             const size_t begin, | 
| 220 |                                             const size_t end,  | 
| 221 |                                             const Vi &identity, | 
| 222 |                                             V &leftReduction, | 
| 223 |                                             V &rightReduction, | 
| 224 |                                             const IsLeft& is_left,  | 
| 225 |                                             const Reduction_T& reduction_t, | 
| 226 |                                             const Reduction_V& reduction_v, | 
| 227 |                                             size_t BLOCK_SIZE = 128) | 
| 228 |   { | 
| 229 |     /* fall back to single threaded partitioning for small N */ | 
| 230 |     if (unlikely(end-begin < BLOCK_SIZE)) | 
| 231 |       return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); | 
| 232 |  | 
| 233 |     /* otherwise use parallel code */ | 
| 234 |     else { | 
| 235 |       typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; | 
| 236 |       std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); | 
| 237 |       return begin+p->partition(leftReduction,rightReduction);     | 
| 238 |     } | 
| 239 |   } | 
| 240 |  | 
| 241 |   template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> | 
| 242 |     __noinline size_t parallel_partitioning(T* array,  | 
| 243 |                                             const size_t begin, | 
| 244 |                                             const size_t end,  | 
| 245 |                                             const Vi &identity, | 
| 246 |                                             V &leftReduction, | 
| 247 |                                             V &rightReduction, | 
| 248 |                                             const IsLeft& is_left,  | 
| 249 |                                             const Reduction_T& reduction_t, | 
| 250 |                                             const Reduction_V& reduction_v, | 
| 251 |                                             size_t BLOCK_SIZE, | 
| 252 |                                             size_t PARALLEL_THRESHOLD) | 
| 253 |   { | 
| 254 |     /* fall back to single threaded partitioning for small N */ | 
| 255 |     if (unlikely(end-begin < PARALLEL_THRESHOLD)) | 
| 256 |       return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); | 
| 257 |  | 
| 258 |     /* otherwise use parallel code */ | 
| 259 |     else { | 
| 260 |       typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; | 
| 261 |       std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); | 
| 262 |       return begin+p->partition(leftReduction,rightReduction);     | 
| 263 |     } | 
| 264 |   } | 
| 265 |  | 
| 266 |  | 
| 267 |   template<typename T, typename IsLeft> | 
| 268 |     inline size_t parallel_partitioning(T* array,  | 
| 269 |                                         const size_t begin, | 
| 270 |                                         const size_t end,  | 
| 271 |                                         const IsLeft& is_left,  | 
| 272 |                                         size_t BLOCK_SIZE = 128) | 
| 273 |   { | 
| 274 |     size_t leftReduction = 0; | 
| 275 |     size_t rightReduction = 0; | 
| 276 |     return parallel_partitioning( | 
| 277 |       array,begin,end,0,leftReduction,rightReduction,is_left, | 
| 278 |       [] (size_t& t,const T& ref) {  }, | 
| 279 |       [] (size_t& t0,size_t& t1) { }, | 
| 280 |       BLOCK_SIZE); | 
| 281 |   } | 
| 282 |  | 
| 283 | } | 
| 284 |  |