// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../simd/simd.h"
#include "parallel_for.h"
#include <algorithm>

namespace embree
{
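  /* In-place insertion sort over array[0,length). Quadratic in the worst
     case, but efficient on the short runs it is used for below the
     quicksort and radix-sort thresholds in this file. */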
  template<class T>
  __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
  {
    for(size_t i = 1;i<length;++i)
    {
      T v = array[i];
      size_t j = i;
      while(j > 0 && v < array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

  template<class T>
  __forceinline void insertionsort_descending(T *__restrict__ array, const size_t length)
  {
    for(size_t i = 1;i<length;++i)
    {
      T v = array[i];
      size_t j = i;
      while(j > 0 && v > array[j-1])
      {
        array[j] = array[j-1];
        --j;
      }
      array[j] = v;
    }
  }

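  /* Recursive quicksort with Hoare partitioning: the first element serves
     as the pivot value, the left/right cursors scan toward each other and
     swap misplaced elements, and the function recurses on [begin,pivot]
     and [pivot+1,end]. */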
  template<class T>
  void quicksort_ascending(T *__restrict__ t,
                           const ssize_t begin,
                           const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left = begin - 1;
      ssize_t right = end + 1;

      while(1)
      {
        while (t[--right] > pivotvalue);
        while (t[++left] < pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      const ssize_t pivot = right;
      quicksort_ascending(t, begin, pivot);
      quicksort_ascending(t, pivot + 1, end);
    }
  }

  template<class T>
  void quicksort_descending(T *__restrict__ t,
                            const ssize_t begin,
                            const ssize_t end)
  {
    if (likely(begin < end))
    {
      const T pivotvalue = t[begin];
      ssize_t left = begin - 1;
      ssize_t right = end + 1;

      while(1)
      {
        while (t[--right] < pivotvalue);
        while (t[++left] > pivotvalue);

        if (left >= right) break;

        const T temp = t[right];
        t[right] = t[left];
        t[left] = temp;
      }

      const ssize_t pivot = right;
      quicksort_descending(t, begin, pivot);
      quicksort_descending(t, pivot + 1, end);
    }
  }

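  /* Hybrid quicksort that switches to insertion sort once a subrange
     contains at most THRESHOLD elements. */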
  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_ascending(T *__restrict__ t,
                                         const ssize_t begin,
                                         const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_ascending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left = begin - 1;
        ssize_t right = end + 1;

        while(1)
        {
          while (t[--right] > pivotvalue);
          while (t[++left] < pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_ascending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

  template<class T, ssize_t THRESHOLD>
  void quicksort_insertionsort_descending(T *__restrict__ t,
                                          const ssize_t begin,
                                          const ssize_t end)
  {
    if (likely(begin < end))
    {
      const ssize_t size = end-begin+1;
      if (likely(size <= THRESHOLD))
      {
        insertionsort_descending<T>(&t[begin],size);
      }
      else
      {
        const T pivotvalue = t[begin];
        ssize_t left = begin - 1;
        ssize_t right = end + 1;

        while(1)
        {
          while (t[--right] < pivotvalue);
          while (t[++left] > pivotvalue);

          if (left >= right) break;

          const T temp = t[right];
          t[right] = t[left];
          t[left] = temp;
        }

        const ssize_t pivot = right;
        quicksort_insertionsort_descending<T,THRESHOLD>(t, begin, pivot);
        quicksort_insertionsort_descending<T,THRESHOLD>(t, pivot + 1, end);
      }
    }
  }

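  /* In-place MSD radix sort over 32-bit keys ("American flag" style):
     bucket the keys by the byte selected by 'shift' using cyclic swaps,
     then recurse on each bucket with the next lower byte; small buckets
     fall back to insertion sort. */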
  template<typename T>
  static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
  {
    static const unsigned int BITS = 8;
    static const unsigned int BUCKETS = (1 << BITS);
    static const unsigned int CMP_SORT_THRESHOLD = 16;

    __aligned(64) unsigned int count[BUCKETS];

    /* clear buckets */
    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;

    /* count buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
    for (size_t i=0;i<num;i++)
      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;

    /* prefix sums */
    __aligned(64) unsigned int head[BUCKETS];
    __aligned(64) unsigned int tail[BUCKETS];

    head[0] = 0;
    for (size_t i=1; i<BUCKETS; i++)
      head[i] = head[i-1] + count[i-1];

    for (size_t i=0; i<BUCKETS-1; i++)
      tail[i] = head[i+1];

    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];

    assert(tail[BUCKETS-1] == num);

    /* in-place swap */
    for (size_t i=0;i<BUCKETS;i++)
    {
      /* process bucket */
      while(head[i] < tail[i])
      {
        T v = morton[head[i]];
        while(1)
        {
          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1);
          if (b == i) break;
          std::swap(v,morton[head[b]++]);
        }
        assert(((unsigned(v) >> shift) & (BUCKETS-1)) == i);
        morton[head[i]++] = v;
      }
    }
    if (shift == 0) return;

    size_t offset = 0;
    for (size_t i=0;i<BUCKETS;i++)
      if (count[i])
      {
        for (size_t j=offset;j<offset+count[i];j++)
          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);

        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
          insertionsort_ascending(morton + offset, count[i]);
        else
          radixsort32(morton + offset, count[i], shift-BITS);

        for (size_t j=offset;j<offset+count[i]-1;j++)
          assert(morton[j] <= morton[j+1]);

        offset += count[i];
      }
  }

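  /* Parallel LSD radix sort over BITS bits per pass. Each pass runs two
     phases: phase 0 builds a per-task bucket histogram, phase 1 turns the
     histograms into global scatter offsets and copies the items. Passes
     ping-pong between the src and tmp arrays, so after the even number of
     passes the sorted result ends up back in src. */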
  template<typename Ty, typename Key>
  class ParallelRadixSort
  {
    static const size_t MAX_TASKS = 64;
    static const size_t BITS = 8;
    static const size_t BUCKETS = (1 << BITS);
    typedef unsigned int TyRadixCount[BUCKETS];

    template<typename T>
    static bool compare(const T& v0, const T& v1) {
      return (Key)v0 < (Key)v1;
    }

  private:
    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement

  public:
    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}

    void sort(const size_t blockSize)
    {
      assert(blockSize > 0);

      /* perform single-threaded sort for small N */
      if (N<=blockSize) // also handles the special case N == 0
      {
        /* sort in place inside the destination array */
        std::sort(src,src+N,compare<Ty>);
      }

      /* perform parallel sort for large N */
      else
      {
        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
        tbbRadixSort(numThreads);
      }
    }

    ~ParallelRadixSort()
    {
      alignedFree(radixCount);
      radixCount = nullptr;
    }

  private:

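    /* phase 0: count how many items of this task's slice of the input fall
       into each of the BUCKETS buckets of the current digit */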
    void tbbRadixIteration0(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* count how many items go into the buckets */
      for (size_t i=0; i<BUCKETS; i++)
        radixCount[threadIndex][i] = 0;

      /* iterate over src array and count buckets */
      unsigned int * __restrict const count = radixCount[threadIndex];
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const Key index = ((Key)src[i] >> shift) & mask;
#endif
        count[index]++;
      }
    }

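    /* phase 1: reduce the per-task histograms to global scatter offsets
       (prefix sum over buckets, plus the counts of all preceding tasks)
       and copy each item of this task's slice into its bucket slot */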
    void tbbRadixIteration1(const Key shift,
                            const Ty* __restrict const src,
                            Ty* __restrict const dst,
                            const size_t threadIndex, const size_t threadCount)
    {
      const size_t startID = (threadIndex+0)*N/threadCount;
      const size_t endID   = (threadIndex+1)*N/threadCount;

      /* mask to extract some number of bits */
      const Key mask = BUCKETS-1;

      /* calculate total number of items for each bucket */
      __aligned(64) unsigned int total[BUCKETS];
      /*
      for (size_t i=0; i<BUCKETS; i++)
        total[i] = 0;
      */
      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
        vintx::store(&total[i], zero);

      for (size_t i=0; i<threadCount; i++)
      {
        /*
        for (size_t j=0; j<BUCKETS; j++)
          total[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
      }

      /* calculate start offset of each bucket */
      __aligned(64) unsigned int offset[BUCKETS];
      offset[0] = 0;
      for (size_t i=1; i<BUCKETS; i++)
        offset[i] = offset[i-1] + total[i-1];

      /* calculate start offset of each bucket for this thread */
      for (size_t i=0; i<threadIndex; i++)
      {
        /*
        for (size_t j=0; j<BUCKETS; j++)
          offset[j] += radixCount[i][j];
        */
        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
      }

      /* copy items into their buckets */
#if defined(__INTEL_COMPILER)
#pragma nounroll
#endif
      for (size_t i=startID; i<endID; i++) {
        const Ty elt = src[i];
#if defined(__64BIT__)
        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
#else
        const size_t index = ((Key)src[i] >> shift) & mask;
#endif
        dst[offset[index]++] = elt;
      }
    }

    void tbbRadixIteration(const Key shift, const bool last,
                           const Ty* __restrict src, Ty* __restrict dst,
                           const size_t numTasks)
    {
      affinity_partitioner ap;
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
    }

    void tbbRadixSort(const size_t numTasks)
    {
      radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);

      if (sizeof(Key) == sizeof(uint32_t)) {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
      }
      else if (sizeof(Key) == sizeof(uint64_t))
      {
        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
      }
    }

  private:
    TyRadixCount* radixCount;
    Ty* const src;
    Ty* const tmp;
    const size_t N;
  };

  template<typename Ty>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Ty>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty, typename Key>
  void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192)
  {
    ParallelRadixSort<Ty,Key>(src,tmp,N).sort(blockSize);
  }

  template<typename Ty>
  void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint32_t>(src,tmp,N,blockSize);
  }

  template<typename Ty>
  void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
    radix_sort<Ty,uint64_t>(src,tmp,N,blockSize);
  }
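
  /* Example usage (illustrative sketch; 'keys' and 'scratch' are
     hypothetical caller-provided buffers of equal size):

       std::vector<uint64_t> keys = ...;            // e.g. 64-bit morton codes
       std::vector<uint64_t> scratch(keys.size());  // temporary ping-pong buffer
       embree::radix_sort_u64(keys.data(), scratch.data(), keys.size());

     After the even number of ping-pong passes the sorted result is back in
     'keys'; the contents of 'scratch' are unspecified. */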
}