1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #pragma once |
5 | |
6 | #include "parallel_for.h" |
7 | #include "../math/range.h" |
8 | |
9 | namespace embree |
10 | { |
11 | /* serial partitioning */ |
12 | template<typename T, typename V, typename IsLeft, typename Reduction_T> |
13 | __forceinline size_t serial_partitioning(T* array, |
14 | const size_t begin, |
15 | const size_t end, |
16 | V& leftReduction, |
17 | V& rightReduction, |
18 | const IsLeft& is_left, |
19 | const Reduction_T& reduction_t) |
20 | { |
21 | T* l = array + begin; |
22 | T* r = array + end - 1; |
23 | |
24 | while(1) |
25 | { |
26 | /* *l < pivot */ |
27 | while (likely(l <= r && is_left(*l) )) |
28 | { |
29 | //prefetchw(l+4); // FIXME: enable? |
30 | reduction_t(leftReduction,*l); |
31 | ++l; |
32 | } |
      /* *r >= pivot */
34 | while (likely(l <= r && !is_left(*r))) |
35 | { |
        //prefetchw(r-4); // FIXME: enable?
37 | reduction_t(rightReduction,*r); |
38 | --r; |
39 | } |
40 | if (r<l) break; |
41 | |
42 | reduction_t(leftReduction ,*r); |
43 | reduction_t(rightReduction,*l); |
44 | xchg(*l,*r); |
45 | l++; r--; |
46 | } |
47 | |
48 | return l - array; |
49 | } |
50 | |
51 | template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> |
52 | class __aligned(64) parallel_partition_task |
53 | { |
54 | ALIGNED_CLASS_(64); |
55 | private: |
56 | |
57 | static const size_t MAX_TASKS = 64; |
58 | |
59 | T* array; |
60 | size_t N; |
61 | const IsLeft& is_left; |
62 | const Reduction_T& reduction_t; |
63 | const Reduction_V& reduction_v; |
64 | const Vi& identity; |
65 | |
66 | size_t numTasks; |
67 | __aligned(64) size_t counter_start[MAX_TASKS+1]; |
68 | __aligned(64) size_t counter_left[MAX_TASKS+1]; |
69 | __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS]; |
70 | __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; |
71 | __aligned(64) V leftReductions[MAX_TASKS]; |
72 | __aligned(64) V rightReductions[MAX_TASKS]; |
73 | |
74 | public: |
75 | |
76 | __forceinline parallel_partition_task(T* array, |
77 | const size_t N, |
78 | const Vi& identity, |
79 | const IsLeft& is_left, |
80 | const Reduction_T& reduction_t, |
81 | const Reduction_V& reduction_v, |
82 | const size_t BLOCK_SIZE) |
83 | |
84 | : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity), |
        numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
86 | |
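    /* finds the misplaced range that contains the given global item index;
       on return 'index' holds the local offset within the returned range */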
87 | __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges) |
88 | { |
89 | size_t i = 0; |
90 | while(index >= (size_t)r[i].size()) |
91 | { |
92 | assert(i < numRanges); |
93 | index -= (size_t)r[i].size(); |
94 | i++; |
95 | } |
96 | return &r[i]; |
97 | } |
98 | |
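    /* swaps the misplaced items with global positions [startID,endID) of the
       left ranges with the corresponding misplaced items of the right ranges */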
99 | __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges, |
100 | const size_t numRightMisplacedRanges, |
101 | const size_t startID, |
102 | const size_t endID) |
103 | { |
104 | size_t leftLocalIndex = startID; |
105 | size_t rightLocalIndex = startID; |
      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
108 | |
109 | size_t l_left = l_range->size() - leftLocalIndex; |
110 | size_t r_left = r_range->size() - rightLocalIndex; |
111 | T *__restrict__ l = &array[l_range->begin() + leftLocalIndex]; |
112 | T *__restrict__ r = &array[r_range->begin() + rightLocalIndex]; |
113 | size_t size = endID - startID; |
      size_t items = min(size,min(l_left,r_left));
115 | |
116 | while (size) |
117 | { |
118 | if (unlikely(l_left == 0)) |
119 | { |
120 | l_range++; |
121 | l_left = l_range->size(); |
122 | l = &array[l_range->begin()]; |
          items = min(size,min(l_left,r_left));
124 | } |
125 | |
126 | if (unlikely(r_left == 0)) |
127 | { |
128 | r_range++; |
129 | r_left = r_range->size(); |
130 | r = &array[r_range->begin()]; |
          items = min(size,min(l_left,r_left));
132 | } |
133 | |
134 | size -= items; |
135 | l_left -= items; |
136 | r_left -= items; |
137 | |
138 | while(items) { |
139 | items--; |
140 | xchg(*l++,*r++); |
141 | } |
142 | } |
143 | } |
144 | |
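    /* performs the parallel partitioning and returns the split position;
       the left and right reductions are returned through the references */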
145 | __forceinline size_t partition(V& leftReduction, V& rightReduction) |
146 | { |
147 | /* partition the individual ranges for each task */ |
148 | parallel_for(numTasks,[&] (const size_t taskID) { |
149 | const size_t startID = (taskID+0)*N/numTasks; |
150 | const size_t endID = (taskID+1)*N/numTasks; |
151 | V local_left(identity); |
152 | V local_right(identity); |
153 | const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t); |
154 | counter_start[taskID] = startID; |
155 | counter_left [taskID] = mid-startID; |
156 | leftReductions[taskID] = local_left; |
157 | rightReductions[taskID] = local_right; |
158 | }); |
159 | counter_start[numTasks] = N; |
160 | counter_left[numTasks] = 0; |
161 | |
162 | /* finalize the reductions */ |
163 | for (size_t i=0; i<numTasks; i++) { |
164 | reduction_v(leftReduction,leftReductions[i]); |
165 | reduction_v(rightReduction,rightReductions[i]); |
166 | } |
167 | |
168 | /* calculate mid point for partitioning */ |
169 | size_t mid = counter_left[0]; |
170 | for (size_t i=1; i<numTasks; i++) |
171 | mid += counter_left[i]; |
172 | const range<ssize_t> globalLeft (0,mid); |
173 | const range<ssize_t> globalRight(mid,N); |
174 | |
175 | /* calculate all left and right ranges that are on the wrong global side */ |
176 | size_t numMisplacedRangesLeft = 0; |
177 | size_t numMisplacedRangesRight = 0; |
178 | size_t numMisplacedItemsLeft = 0; |
179 | size_t numMisplacedItemsRight = 0; |
180 | |
181 | for (size_t i=0; i<numTasks; i++) |
182 | { |
183 | const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]); |
184 | const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]); |
        const range<ssize_t> left_misplaced  = globalLeft. intersect(right_range);
        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
187 | |
188 | if (!left_misplaced.empty()) |
189 | { |
190 | numMisplacedItemsLeft += left_misplaced.size(); |
191 | leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced; |
192 | } |
193 | |
194 | if (!right_misplaced.empty()) |
195 | { |
196 | numMisplacedItemsRight += right_misplaced.size(); |
197 | rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced; |
198 | } |
199 | } |
200 | assert( numMisplacedItemsLeft == numMisplacedItemsRight ); |
201 | |
202 | /* if no items are misplaced we are done */ |
203 | if (numMisplacedItemsLeft == 0) |
204 | return mid; |
205 | |
      /* otherwise we swap the misplaced items into place in parallel */
207 | parallel_for(numTasks,[&] (const size_t taskID) { |
208 | const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks; |
209 | const size_t endID = (taskID+1)*numMisplacedItemsLeft/numTasks; |
        swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);
211 | }); |
212 | |
213 | return mid; |
214 | } |
215 | }; |
216 | |
217 | template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> |
218 | __noinline size_t parallel_partitioning(T* array, |
219 | const size_t begin, |
220 | const size_t end, |
221 | const Vi &identity, |
222 | V &leftReduction, |
223 | V &rightReduction, |
224 | const IsLeft& is_left, |
225 | const Reduction_T& reduction_t, |
226 | const Reduction_V& reduction_v, |
227 | size_t BLOCK_SIZE = 128) |
228 | { |
229 | /* fall back to single threaded partitioning for small N */ |
230 | if (unlikely(end-begin < BLOCK_SIZE)) |
231 | return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); |
232 | |
233 | /* otherwise use parallel code */ |
234 | else { |
235 | typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; |
236 | std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); |
237 | return begin+p->partition(leftReduction,rightReduction); |
238 | } |
239 | } |
240 | |
241 | template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V> |
242 | __noinline size_t parallel_partitioning(T* array, |
243 | const size_t begin, |
244 | const size_t end, |
245 | const Vi &identity, |
246 | V &leftReduction, |
247 | V &rightReduction, |
248 | const IsLeft& is_left, |
249 | const Reduction_T& reduction_t, |
250 | const Reduction_V& reduction_v, |
251 | size_t BLOCK_SIZE, |
252 | size_t PARALLEL_THRESHOLD) |
253 | { |
254 | /* fall back to single threaded partitioning for small N */ |
255 | if (unlikely(end-begin < PARALLEL_THRESHOLD)) |
256 | return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t); |
257 | |
258 | /* otherwise use parallel code */ |
259 | else { |
260 | typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task; |
261 | std::unique_ptr<partition_task> p(new partition_task(&array[begin],end-begin,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE)); |
262 | return begin+p->partition(leftReduction,rightReduction); |
263 | } |
264 | } |
265 | |
266 | |
267 | template<typename T, typename IsLeft> |
268 | inline size_t parallel_partitioning(T* array, |
269 | const size_t begin, |
270 | const size_t end, |
271 | const IsLeft& is_left, |
272 | size_t BLOCK_SIZE = 128) |
273 | { |
274 | size_t leftReduction = 0; |
275 | size_t rightReduction = 0; |
276 | return parallel_partitioning( |
277 | array,begin,end,0,leftReduction,rightReduction,is_left, |
278 | [] (size_t& t,const T& ref) { }, |
279 | [] (size_t& t0,size_t& t1) { }, |
280 | BLOCK_SIZE); |
281 | } |
282 | |
283 | } |
284 | |