1// Copyright 2009-2021 Intel Corporation
2// SPDX-License-Identifier: Apache-2.0
3
4#pragma once
5
6#include "parallel_for.h"
7#include "../math/range.h"
8
9namespace embree
10{
11 /* serial partitioning */
12 template<typename T, typename V, typename IsLeft, typename Reduction_T>
13 __forceinline size_t serial_partitioning(T* array,
14 const size_t begin,
15 const size_t end,
16 V& leftReduction,
17 V& rightReduction,
18 const IsLeft& is_left,
19 const Reduction_T& reduction_t)
20 {
21 T* l = array + begin;
22 T* r = array + end - 1;
23
24 while(1)
25 {
26 /* *l < pivot */
27 while (likely(l <= r && is_left(*l) ))
28 {
29 //prefetchw(l+4); // FIXME: enable?
30 reduction_t(leftReduction,*l);
31 ++l;
32 }
33 /* *r >= pivot) */
34 while (likely(l <= r && !is_left(*r)))
35 {
36 //prefetchw(r-4); FIXME: enable?
37 reduction_t(rightReduction,*r);
38 --r;
39 }
40 if (r<l) break;
41
42 reduction_t(leftReduction ,*r);
43 reduction_t(rightReduction,*l);
44 xchg(*l,*r);
45 l++; r--;
46 }
47
48 return l - array;
49 }
50
51 template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
52 class __aligned(64) parallel_partition_task
53 {
54 ALIGNED_CLASS_(64);
55 private:
56
57 static const size_t MAX_TASKS = 64;
58
59 T* array;
60 size_t N;
61 const IsLeft& is_left;
62 const Reduction_T& reduction_t;
63 const Reduction_V& reduction_v;
64 const Vi& identity;
65
66 size_t numTasks;
67 __aligned(64) size_t counter_start[MAX_TASKS+1];
68 __aligned(64) size_t counter_left[MAX_TASKS+1];
69 __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];
70 __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS];
71 __aligned(64) V leftReductions[MAX_TASKS];
72 __aligned(64) V rightReductions[MAX_TASKS];
73
74 public:
75
76 __forceinline parallel_partition_task(T* array,
77 const size_t N,
78 const Vi& identity,
79 const IsLeft& is_left,
80 const Reduction_T& reduction_t,
81 const Reduction_V& reduction_v,
82 const size_t BLOCK_SIZE)
83
84 : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
85 numTasks(min(a: (N+BLOCK_SIZE-1)/BLOCK_SIZE,b: min(a: TaskScheduler::threadCount(),b: MAX_TASKS))) {}
86
87 __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
88 {
89 size_t i = 0;
90 while(index >= (size_t)r[i].size())
91 {
92 assert(i < numRanges);
93 index -= (size_t)r[i].size();
94 i++;
95 }
96 return &r[i];
97 }
98
99 __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
100 const size_t numRightMisplacedRanges,
101 const size_t startID,
102 const size_t endID)
103 {
104 size_t leftLocalIndex = startID;
105 size_t rightLocalIndex = startID;
106 const range<ssize_t>* l_range = findStartRange(index&: leftLocalIndex,r: leftMisplacedRanges,numRanges: numLeftMisplacedRanges);
107 const range<ssize_t>* r_range = findStartRange(index&: rightLocalIndex,r: rightMisplacedRanges,numRanges: numRightMisplacedRanges);
108
109 size_t l_left = l_range->size() - leftLocalIndex;
110 size_t r_left = r_range->size() - rightLocalIndex;
111 T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
112 T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
113 size_t size = endID - startID;
114 size_t items = min(a: size,b: min(a: l_left,b: r_left));
115
116 while (size)
117 {
118 if (unlikely(l_left == 0))
119 {
120 l_range++;
121 l_left = l_range->size();
122 l = &array[l_range->begin()];
123 items = min(a: size,b: min(a: l_left,b: r_left));
124 }
125
126 if (unlikely(r_left == 0))
127 {
128 r_range++;
129 r_left = r_range->size();
130 r = &array[r_range->begin()];
131 items = min(a: size,b: min(a: l_left,b: r_left));
132 }
133
134 size -= items;
135 l_left -= items;
136 r_left -= items;
137
138 while(items) {
139 items--;
140 xchg(*l++,*r++);
141 }
142 }
143 }
144
145 __forceinline size_t partition(V& leftReduction, V& rightReduction)
146 {
147 /* partition the individual ranges for each task */
148 parallel_for(numTasks,[&] (const size_t taskID) {
149 const size_t startID = (taskID+0)*N/numTasks;
150 const size_t endID = (taskID+1)*N/numTasks;
151 V local_left(identity);
152 V local_right(identity);
153 const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
154 counter_start[taskID] = startID;
155 counter_left [taskID] = mid-startID;
156 leftReductions[taskID] = local_left;
157 rightReductions[taskID] = local_right;
158 });
159 counter_start[numTasks] = N;
160 counter_left[numTasks] = 0;
161
162 /* finalize the reductions */
163 for (size_t i=0; i<numTasks; i++) {
164 reduction_v(leftReduction,leftReductions[i]);
165 reduction_v(rightReduction,rightReductions[i]);
166 }
167
168 /* calculate mid point for partitioning */
169 size_t mid = counter_left[0];
170 for (size_t i=1; i<numTasks; i++)
171 mid += counter_left[i];
172 const range<ssize_t> globalLeft (0,mid);
173 const range<ssize_t> globalRight(mid,N);
174
175 /* calculate all left and right ranges that are on the wrong global side */
176 size_t numMisplacedRangesLeft = 0;
177 size_t numMisplacedRangesRight = 0;
178 size_t numMisplacedItemsLeft = 0;
179 size_t numMisplacedItemsRight = 0;
180
181 for (size_t i=0; i<numTasks; i++)
182 {
183 const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
184 const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
185 const range<ssize_t> left_misplaced = globalLeft. intersect(r: right_range);
186 const range<ssize_t> right_misplaced = globalRight.intersect(r: left_range);
187
188 if (!left_misplaced.empty())
189 {
190 numMisplacedItemsLeft += left_misplaced.size();
191 leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
192 }
193
194 if (!right_misplaced.empty())
195 {
196 numMisplacedItemsRight += right_misplaced.size();
197 rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
198 }
199 }
200 assert( numMisplacedItemsLeft == numMisplacedItemsRight );
201
202 /* if no items are misplaced we are done */
203 if (numMisplacedItemsLeft == 0)
204 return mid;
205
206 /* otherwise we copy the items to the right place in parallel */
207 parallel_for(numTasks,[&] (const size_t taskID) {
208 const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
209 const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks;
210 swapItemsInMisplacedRanges(numLeftMisplacedRanges: numMisplacedRangesLeft,numRightMisplacedRanges: numMisplacedRangesRight,startID,endID);
211 });
212
213 return mid;
214 }
215 };
216
217 template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
218 __noinline size_t parallel_partitioning(T* array,
219 const size_t begin,
220 const size_t end,
221 const Vi &identity,
222 V &leftReduction,
223 V &rightReduction,
224 const IsLeft& is_left,
225 const Reduction_T& reduction_t,
226 const Reduction_V& reduction_v,
227 size_t BLOCK_SIZE = 128)
228 {
229 /* fall back to single threaded partitioning for small N */
230 if (unlikely(end-begin < BLOCK_SIZE))
231 return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
232
233 /* otherwise use parallel code */
234 else {
235 typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
236 std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
237 return begin+p->partition(leftReduction,rightReduction);
238 }
239 }
240
241 template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
242 __noinline size_t parallel_partitioning(T* array,
243 const size_t begin,
244 const size_t end,
245 const Vi &identity,
246 V &leftReduction,
247 V &rightReduction,
248 const IsLeft& is_left,
249 const Reduction_T& reduction_t,
250 const Reduction_V& reduction_v,
251 size_t BLOCK_SIZE,
252 size_t PARALLEL_THRESHOLD)
253 {
254 /* fall back to single threaded partitioning for small N */
255 if (unlikely(end-begin < PARALLEL_THRESHOLD))
256 return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
257
258 /* otherwise use parallel code */
259 else {
260 typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
261 std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
262 return begin+p->partition(leftReduction,rightReduction);
263 }
264 }
265
266
267 template<typename T, typename IsLeft>
268 inline size_t parallel_partitioning(T* array,
269 const size_t begin,
270 const size_t end,
271 const IsLeft& is_left,
272 size_t BLOCK_SIZE = 128)
273 {
274 size_t leftReduction = 0;
275 size_t rightReduction = 0;
276 return parallel_partitioning(
277 array,begin,end,0,leftReduction,rightReduction,is_left,
278 [] (size_t& t,const T& ref) { },
279 [] (size_t& t0,size_t& t1) { },
280 BLOCK_SIZE);
281 }
282
283}
284

source code of qtquick3d/src/3rdparty/embree/common/algorithms/parallel_partition.h