parallel_backend_tbb.h source code [include/c++/11/pstl/parallel_backend_tbb.h]

1	// -- C++ --
2	//===-- parallel_backend_tbb.h --------------------------------------------===//
3	//
4	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5	// See https://llvm.org/LICENSE.txt for license information.
6	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7	//
8	//===----------------------------------------------------------------------===//
9
10	#ifndef _PSTL_PARALLEL_BACKEND_TBB_H
11	#define _PSTL_PARALLEL_BACKEND_TBB_H
12
13	#include <algorithm>
14	#include <type_traits>
15
16	#include "parallel_backend_utils.h"
17
18	// Bring in minimal required subset of Intel TBB
19	#include <tbb/blocked_range.h>
20	#include <tbb/parallel_for.h>
21	#include <tbb/parallel_reduce.h>
22	#include <tbb/parallel_scan.h>
23	#include <tbb/parallel_invoke.h>
24	#include <tbb/task_arena.h>
25	#include <tbb/tbb_allocator.h>
26	#include <tbb/task.h>
27
28	#if TBB_INTERFACE_VERSION < 10000
29	# error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
30	#endif
31
32	namespace __pstl
33	{
34	namespace __tbb_backend
35	{
36
37	//! Raw memory buffer with automatic freeing and no exceptions.
38	/* Some of our algorithms need to start with raw memory buffer,*
39	not an initialize array, because initialization/destruction
40	would make the span be at least O(N). /*
41	// tbb::allocator can improve performance in some cases.
42	template <typename _Tp>
43	class __buffer
44	{
45	tbb::tbb_allocator<_Tp> _M_allocator;
46	_Tp* _M_ptr;
47	const std::size_t _M_buf_size;
48	__buffer(const __buffer&) = delete;
49	void
50	operator=(const __buffer&) = delete;
51
52	public:
53	//! Try to obtain buffer of given size to store objects of _Tp type
54	__buffer(std::size_t n) : _M_allocator(), _M_ptr(_M_allocator.allocate(n)), _M_buf_size(n) {}
55	//! True if buffer was successfully obtained, zero otherwise.
56	operator bool() const { return _M_ptr != NULL; }
57	//! Return pointer to buffer, or NULL if buffer could not be obtained.
58	_Tp*
59	get() const
60	{
61	return _M_ptr;
62	}
63	//! Destroy buffer
64	~__buffer() { _M_allocator.deallocate(_M_ptr, _M_buf_size); }
65	};
66
67	// Wrapper for tbb::task
68	inline void
69	__cancel_execution()
70	{
71	#if TBB_INTERFACE_VERSION <= 12000
72	tbb::task::self().group()->cancel_group_execution();
73	#else
74	tbb::task::current_context()->cancel_group_execution();
75	#endif
76	}
77
78	//------------------------------------------------------------------------
79	// parallel_for
80	//------------------------------------------------------------------------
81
82	template <class _Index, class _RealBody>
83	class __parallel_for_body
84	{
85	public:
86	__parallel_for_body(const _RealBody& __body) : _M_body(__body) {}
87	__parallel_for_body(const __parallel_for_body& __body) : _M_body(__body._M_body) {}
88	void
89	operator()(const tbb::blocked_range<_Index>& __range) const
90	{
91	_M_body(__range.begin(), __range.end());
92	}
93
94	private:
95	_RealBody _M_body;
96	};
97
98	//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
99	// wrapper over tbb::parallel_for
100	template <class _ExecutionPolicy, class _Index, class _Fp>
101	void
102	__parallel_for(_ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
103	{
104	tbb::this_task_arena::isolate([=]() {
105	tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last), __parallel_for_body<_Index, _Fp>(__f));
106	});
107	}
108
109	//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
110	// wrapper over tbb::parallel_reduce
111	template <class _ExecutionPolicy, class _Value, class _Index, typename _RealBody, typename _Reduction>
112	_Value
113	__parallel_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, const _Value& __identity,
114	const _RealBody& __real_body, const _Reduction& __reduction)
115	{
116	return tbb::this_task_arena::isolate([__first, __last, &__identity, &__real_body, &__reduction]() -> _Value {
117	return tbb::parallel_reduce(
118	tbb::blocked_range<_Index>(__first, __last), __identity,
119	[__real_body](const tbb::blocked_range<_Index>& __r, const _Value& __value) -> _Value {
120	return __real_body(__r.begin(), __r.end(), __value);
121	},
122	__reduction);
123	});
124	}
125
126	//------------------------------------------------------------------------
127	// parallel_transform_reduce
128	//
129	// Notation:
130	// r(i,j,init) returns reduction of init with reduction over [i,j)
131	// u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
132	// c(x,y) combines values x and y that were the result of r or u
133	//------------------------------------------------------------------------
134
135	template <class _Index, class _Up, class _Tp, class _Cp, class _Rp>
136	struct __par_trans_red_body
137	{
138	alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
139	_Rp _M_brick_reduce; // Most likely to have non-empty layout
140	_Up _M_u;
141	_Cp _M_combine;
142	bool _M_has_sum; // Put last to minimize size of class
143	_Tp&
144	sum()
145	{
146	_PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
147	return (_Tp)_M_sum_storage;
148	}
149	__par_trans_red_body(_Up __u, _Tp __init, _Cp __c, _Rp __r)
150	: _M_brick_reduce(__r), _M_u(__u), _M_combine(__c), _M_has_sum(true)
151	{
152	new (_M_sum_storage) _Tp(__init);
153	}
154
155	__par_trans_red_body(__par_trans_red_body& __left, tbb::split)
156	: _M_brick_reduce(__left._M_brick_reduce), _M_u(__left._M_u), _M_combine(__left._M_combine), _M_has_sum(false)
157	{
158	}
159
160	~__par_trans_red_body()
161	{
162	// 17.6.5.12 tells us to not worry about catching exceptions from destructors.
163	if (_M_has_sum)
164	sum().~_Tp();
165	}
166
167	void
168	join(__par_trans_red_body& __rhs)
169	{
170	sum() = _M_combine(sum(), __rhs.sum());
171	}
172
173	void
174	operator()(const tbb::blocked_range<_Index>& __range)
175	{
176	_Index __i = __range.begin();
177	_Index __j = __range.end();
178	if (!_M_has_sum)
179	{
180	_PSTL_ASSERT_MSG(__range.size() > `1`, "there should be at least 2 elements");
181	new (&_M_sum_storage)
182	_Tp(_M_combine(_M_u(__i), _M_u(__i + `1`))); // The condition i+1 < j is provided by the grain size of 3
183	_M_has_sum = true;
184	std::advance(__i, `2`);
185	if (__i == __j)
186	return;
187	}
188	sum() = _M_brick_reduce(__i, __j, sum());
189	}
190	};
191
192	template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp>
193	_Tp
194	__parallel_transform_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, _Up __u, _Tp __init, _Cp __combine,
195	_Rp __brick_reduce)
196	{
197	__tbb_backend::__par_trans_red_body<_Index, _Up, _Tp, _Cp, _Rp> __body(__u, __init, __combine, __brick_reduce);
198	// The grain size of 3 is used in order to provide mininum 2 elements for each body
199	tbb::this_task_arena::isolate(
200	[__first, __last, &__body]() { tbb::parallel_reduce(tbb::blocked_range<_Index>(__first, __last, `3`), __body); });
201	return __body.sum();
202	}
203
204	//------------------------------------------------------------------------
205	// parallel_scan
206	//------------------------------------------------------------------------
207
208	template <class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
209	class __trans_scan_body
210	{
211	alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
212	_Rp _M_brick_reduce; // Most likely to have non-empty layout
213	_Up _M_u;
214	_Cp _M_combine;
215	_Sp _M_scan;
216	bool _M_has_sum; // Put last to minimize size of class
217	public:
218	__trans_scan_body(_Up __u, _Tp __init, _Cp __combine, _Rp __reduce, _Sp __scan)
219	: _M_brick_reduce(__reduce), _M_u(__u), _M_combine(__combine), _M_scan(__scan), _M_has_sum(true)
220	{
221	new (_M_sum_storage) _Tp(__init);
222	}
223
224	__trans_scan_body(__trans_scan_body& __b, tbb::split)
225	: _M_brick_reduce(__b._M_brick_reduce), _M_u(__b._M_u), _M_combine(__b._M_combine), _M_scan(__b._M_scan),
226	_M_has_sum(false)
227	{
228	}
229
230	~__trans_scan_body()
231	{
232	// 17.6.5.12 tells us to not worry about catching exceptions from destructors.
233	if (_M_has_sum)
234	sum().~_Tp();
235	}
236
237	_Tp&
238	sum() const
239	{
240	_PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
241	return *const_cast<_Tp>(reinterpret_cast<_Tp const**>(_M_sum_storage));
242	}
243
244	void
245	operator()(const tbb::blocked_range<_Index>& __range, tbb::pre_scan_tag)
246	{
247	_Index __i = __range.begin();
248	_Index __j = __range.end();
249	if (!_M_has_sum)
250	{
251	new (&_M_sum_storage) _Tp(_M_u(__i));
252	_M_has_sum = true;
253	++__i;
254	if (__i == __j)
255	return;
256	}
257	sum() = _M_brick_reduce(__i, __j, sum());
258	}
259
260	void
261	operator()(const tbb::blocked_range<_Index>& __range, tbb::final_scan_tag)
262	{
263	sum() = _M_scan(__range.begin(), __range.end(), sum());
264	}
265
266	void
267	reverse_join(__trans_scan_body& __a)
268	{
269	if (_M_has_sum)
270	{
271	sum() = _M_combine(__a.sum(), sum());
272	}
273	else
274	{
275	new (&_M_sum_storage) _Tp(__a.sum());
276	_M_has_sum = true;
277	}
278	}
279
280	void
281	assign(__trans_scan_body& __b)
282	{
283	sum() = __b.sum();
284	}
285	};
286
287	template <typename _Index>
288	_Index
289	__split(_Index __m)
290	{
291	_Index __k = `1`;
292	while (`2` * __k < __m)
293	__k *= `2`;
294	return __k;
295	}
296
297	//------------------------------------------------------------------------
298	// __parallel_strict_scan
299	//------------------------------------------------------------------------
300
301	template <typename _Index, typename _Tp, typename _Rp, typename _Cp>
302	void
303	__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine)
304	{
305	if (__m == `1`)
306	__r[`0`] = __reduce(__i * __tilesize, __lastsize);
307	else
308	{
309	_Index __k = __split(__m);
310	tbb::parallel_invoke(
311	[=] { __tbb_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); },
312	[=] {
313	__tbb_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine);
314	});
315	if (__m == `2` * __k)
316	__r[__m - `1`] = __combine(__r[__k - `1`], __r[__m - `1`]);
317	}
318	}
319
320	template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
321	void
322	__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
323	_Sp __scan)
324	{
325	if (__m == `1`)
326	__scan(__i * __tilesize, __lastsize, __initial);
327	else
328	{
329	const _Index __k = __split(__m);
330	tbb::parallel_invoke(
331	[=] { __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); },
332	// Assumes that __combine never throws.
333	//TODO: Consider adding a requirement for user functors to be constant.
334	[=, &__combine] {
335	__tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
336	__combine(__initial, __r[__k - `1`]), __combine, __scan);
337	});
338	}
339	}
340
341	// Adapted from Intel(R) Cilk(TM) version from cilkpub.
342	// Let i:len denote a counted interval of length n starting at i. s denotes a generalized-sum value.
343	// Expected actions of the functors are:
344	// reduce(i,len) -> s -- return reduction value of i:len.
345	// combine(s1,s2) -> s -- return merged sum
346	// apex(s) -- do any processing necessary between reduce and scan.
347	// scan(i,len,initial) -- perform scan over i:len starting with initial.
348	// The initial range 0:n is partitioned into consecutive subranges.
349	// reduce and scan are each called exactly once per subrange.
350	// Thus callers can rely upon side effects in reduce.
351	// combine must not throw an exception.
352	// apex is called exactly once, after all calls to reduce and before all calls to scan.
353	// For example, it's useful for allocating a __buffer used by scan but whose size is the sum of all reduction values.
354	// T must have a trivial constructor and destructor.
355	template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp, typename _Ap>
356	void
357	__parallel_strict_scan(_ExecutionPolicy&&, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan,
358	_Ap __apex)
359	{
360	tbb::this_task_arena::isolate([=, &__combine]() {
361	if (__n > `1`)
362	{
363	_Index __p = tbb::this_task_arena::max_concurrency();
364	const _Index __slack = `4`;
365	_Index __tilesize = (__n - `1`) / (__slack * __p) + `1`;
366	_Index __m = (__n - `1`) / __tilesize;
367	__buffer<_Tp> __buf(__m + `1`);
368	_Tp* __r = __buf.get();
369	__tbb_backend::__upsweep(_Index(`0`), _Index(__m + `1`), __tilesize, __r, __n - __m * __tilesize, __reduce,
370	__combine);
371
372	// When __apex is a no-op and __combine has no side effects, a good optimizer
373	// should be able to eliminate all code between here and __apex.
374	// Alternatively, provide a default value for __apex that can be
375	// recognized by metaprogramming that conditionlly executes the following.
376	size_t __k = __m + `1`;
377	_Tp __t = __r[__k - `1`];
378	while ((__k &= __k - `1`))
379	__t = __combine(__r[__k - `1`], __t);
380	__apex(__combine(__initial, __t));
381	__tbb_backend::__downsweep(_Index(`0`), _Index(__m + `1`), __tilesize, __r, __n - __m * __tilesize, __initial,
382	__combine, __scan);
383	return;
384	}
385	// Fewer than 2 elements in sequence, or out of memory. Handle has single block.
386	_Tp __sum = __initial;
387	if (__n)
388	__sum = __combine(__sum, __reduce(_Index(`0`), __n));
389	__apex(__sum);
390	if (__n)
391	__scan(_Index(`0`), __n, __initial);
392	});
393	}
394
395	template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
396	_Tp
397	__parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce,
398	_Sp __scan)
399	{
400	__trans_scan_body<_Index, _Up, _Tp, _Cp, _Rp, _Sp> __body(__u, __init, __combine, __brick_reduce, __scan);
401	auto __range = tbb::blocked_range<_Index>(`0`, __n);
402	tbb::this_task_arena::isolate([__range, &__body]() { tbb::parallel_scan(__range, __body); });
403	return __body.sum();
404	}
405
406	//------------------------------------------------------------------------
407	// parallel_stable_sort
408	//------------------------------------------------------------------------
409
410	//------------------------------------------------------------------------
411	// stable_sort utilities
412	//
413	// These are used by parallel implementations but do not depend on them.
414	//------------------------------------------------------------------------
415	#define _PSTL_MERGE_CUT_OFF 2000
416
417	template <typename _Func>
418	class __func_task;
419	template <typename _Func>
420	class __root_task;
421
422	#if TBB_INTERFACE_VERSION <= 12000
423	class __task : public tbb::task
424	{
425	public:
426	template <typename _Fn>
427	__task*
428	make_continuation(_Fn&& __f)
429	{
430	return new (allocate_continuation()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
431	}
432
433	template <typename _Fn>
434	__task*
435	make_child_of(__task* parent, _Fn&& __f)
436	{
437	return new (parent->allocate_child()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
438	}
439
440	template <typename _Fn>
441	__task*
442	make_additional_child_of(tbb::task* parent, _Fn&& __f)
443	{
444	return new (tbb::task::allocate_additional_child_of(*parent))
445	__func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
446	}
447
448	inline void
449	recycle_as_continuation()
450	{
451	tbb::task::recycle_as_continuation();
452	}
453
454	inline void
455	recycle_as_child_of(__task* parent)
456	{
457	tbb::task::recycle_as_child_of(*parent);
458	}
459
460	inline void
461	spawn(__task* __t)
462	{
463	tbb::task::spawn(*__t);
464	}
465
466	template <typename _Fn>
467	static inline void
468	spawn_root_and_wait(__root_task<_Fn>& __root)
469	{
470	tbb::task::spawn_root_and_wait(*__root._M_task);
471	}
472	};
473
474	template <typename _Func>
475	class __func_task : public __task
476	{
477	_Func _M_func;
478
479	tbb::task*
480	execute()
481	{
482	return _M_func(this);
483	};
484
485	public:
486	template <typename _Fn>
487	__func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)}
488	{
489	}
490
491	_Func&
492	body()
493	{
494	return _M_func;
495	}
496	};
497
498	template <typename _Func>
499	class __root_task
500	{
501	tbb::task* _M_task;
502
503	public:
504	template <typename... Args>
505	__root_task(Args&&... args)
506	: _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward<Args>(args)...)}}
507	{
508	}
509
510	friend class __task;
511	friend class __func_task<_Func>;
512	};
513
514	#else // TBB_INTERFACE_VERSION <= 12000
515	class __task : public tbb::detail::d1::task
516	{
517	protected:
518	tbb::detail::d1::small_object_allocator _M_allocator{};
519	tbb::detail::d1::execution_data* _M_execute_data{};
520	__task* _M_parent{};
521	std::atomic<int> _M_refcount{};
522	bool _M_recycle{};
523
524	template <typename _Fn>
525	__task*
526	allocate_func_task(_Fn&& __f)
527	{
528	_PSTL_ASSERT(_M_execute_data != nullptr);
529	tbb::detail::d1::small_object_allocator __alloc{};
530	auto __t =
531	__alloc.new_object<__func_task<typename std::decay<_Fn>::type>>(*_M_execute_data, std::forward<_Fn>(__f));
532	__t->_M_allocator = __alloc;
533	return __t;
534	}
535
536	public:
537	__task*
538	parent()
539	{
540	return _M_parent;
541	}
542
543	void
544	set_ref_count(int __n)
545	{
546	_M_refcount.store(i: __n, m: std::memory_order_release);
547	}
548
549	template <typename _Fn>
550	__task*
551	make_continuation(_Fn&& __f)
552	{
553	auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
554	__t->_M_parent = _M_parent;
555	_M_parent = nullptr;
556	return __t;
557	}
558
559	template <typename _Fn>
560	__task*
561	make_child_of(__task* __parent, _Fn&& __f)
562	{
563	auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
564	__t->_M_parent = __parent;
565	return __t;
566	}
567
568	template <typename _Fn>
569	__task*
570	make_additional_child_of(__task* __parent, _Fn&& __f)
571	{
572	auto __t = make_child_of(__parent, std::forward<_Fn>(__f));
573	_PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > `0`);
574	++__parent->_M_refcount;
575	return __t;
576	}
577
578	inline void
579	recycle_as_continuation()
580	{
581	_M_recycle = true;
582	}
583
584	inline void
585	recycle_as_child_of(__task* parent)
586	{
587	_M_recycle = true;
588	_M_parent = parent;
589	}
590
591	inline void
592	spawn(__task* __t)
593	{
594	_PSTL_ASSERT(_M_execute_data != nullptr);
595	tbb::detail::d1::spawn(t&: __t, ctx&: _M_execute_data->context);
596	}
597
598	template <typename _Fn>
599	static inline void
600	spawn_root_and_wait(__root_task<_Fn>& __root)
601	{
602	tbb::detail::d1::execute_and_wait(t&: *__root._M_func_task, t_ctx&: __root._M_context, wait_ctx&: __root._M_wait_object,
603	w_ctx&: __root._M_context);
604	}
605
606	template <typename _Func>
607	friend class __func_task;
608	};
609
610	template <typename _Func>
611	class __func_task : public __task
612	{
613	_Func _M_func;
614
615	__task*
616	execute(tbb::detail::d1::execution_data& __ed) override
617	{
618	_M_execute_data = &__ed;
619	_M_recycle = false;
620	__task* __next = _M_func(this);
621	return finalize(__next);
622	};
623
624	__task*
625	cancel(tbb::detail::d1::execution_data& __ed) override
626	{
627	return finalize(next: nullptr);
628	}
629
630	__task*
631	finalize(__task* __next)
632	{
633	bool __recycle = _M_recycle;
634	_M_recycle = false;
635
636	if (__recycle)
637	{
638	return __next;
639	}
640
641	auto __parent = _M_parent;
642	auto __alloc = _M_allocator;
643	auto __ed = _M_execute_data;
644
645	this->~__func_task();
646
647	_PSTL_ASSERT(__parent != nullptr);
648	_PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > `0`);
649	if (--__parent->_M_refcount == `0`)
650	{
651	_PSTL_ASSERT(__next == nullptr);
652	__alloc.deallocate(this, *__ed);
653	return __parent;
654	}
655
656	return __next;
657	}
658
659	friend class __root_task<_Func>;
660
661	public:
662	template <typename _Fn>
663	__func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f))
664	{
665	}
666
667	_Func&
668	body()
669	{
670	return _M_func;
671	}
672	};
673
674	template <typename _Func>
675	class __root_task : public __task
676	{
677	__task*
678	execute(tbb::detail::d1::execution_data& __ed) override
679	{
680	_M_wait_object.release();
681	return nullptr;
682	};
683
684	__task*
685	cancel(tbb::detail::d1::execution_data& __ed) override
686	{
687	_M_wait_object.release();
688	return nullptr;
689	}
690
691	__func_task<_Func>* _M_func_task{};
692	tbb::detail::d1::wait_context _M_wait_object{`0`};
693	tbb::task_group_context _M_context{};
694
695	public:
696	template <typename... Args>
697	__root_task(Args&&... args) : _M_wait_object {`1`}
698	{
699	tbb::detail::d1::small_object_allocator __alloc{};
700	_M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward<Args>(args)...));
701	_M_func_task->_M_allocator = __alloc;
702	_M_func_task->_M_parent = this;
703	_M_refcount.store(`1`, std::memory_order_relaxed);
704	}
705
706	friend class __task;
707	};
708	#endif // TBB_INTERFACE_VERSION <= 12000
709
710	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _Cleanup,
711	typename _LeafMerge>
712	class __merge_func
713	{
714	typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
715	typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
716	typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
717	typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType;
718
719	_RandomAccessIterator1 _M_x_beg;
720	_RandomAccessIterator2 _M_z_beg;
721
722	_SizeType _M_xs, _M_xe;
723	_SizeType _M_ys, _M_ye;
724	_SizeType _M_zs;
725	_Compare _M_comp;
726	_LeafMerge _M_leaf_merge;
727	_SizeType _M_nsort; //number of elements to be sorted for partial_sort alforithm
728
729	static const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
730
731	bool _root; //means a task is merging root task
732	bool _x_orig; //"true" means X(or left ) subrange is in the original container; false - in the buffer
733	bool _y_orig; //"true" means Y(or right) subrange is in the original container; false - in the buffer
734	bool _split; //"true" means a merge task is a split task for parallel merging, the execution logic differs
735
736	bool
737	is_partial() const
738	{
739	return _M_nsort > `0`;
740	}
741
742	struct __move_value
743	{
744	template <typename Iterator1, typename Iterator2>
745	void
746	operator()(Iterator1 __x, Iterator2 __z)
747	{
748	__z = std::move(__x);
749	}
750	};
751
752	struct __move_value_construct
753	{
754	template <typename Iterator1, typename Iterator2>
755	void
756	operator()(Iterator1 __x, Iterator2 __z)
757	{
758	::new (std::addressof(__z)) _ValueType(std::move(__x));
759	}
760	};
761
762	struct __move_range
763	{
764	template <typename Iterator1, typename Iterator2>
765	Iterator2
766	operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
767	{
768	if (__last1 - __first1 < __merge_cut_off)
769	return std::move(__first1, __last1, __first2);
770
771	auto __n = __last1 - __first1;
772	tbb::parallel_for(tbb::blocked_range<_SizeType>(`0`, __n, __merge_cut_off),
773	[__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
774	std::move(__first1 + __range.begin(), __first1 + __range.end(),
775	__first2 + __range.begin());
776	});
777	return __first2 + __n;
778	}
779	};
780
781	struct __move_range_construct
782	{
783	template <typename Iterator1, typename Iterator2>
784	Iterator2
785	operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
786	{
787	if (__last1 - __first1 < __merge_cut_off)
788	{
789	for (; __first1 != __last1; ++__first1, ++__first2)
790	__move_value_construct()(__first1, __first2);
791	return __first2;
792	}
793
794	auto __n = __last1 - __first1;
795	tbb::parallel_for(tbb::blocked_range<_SizeType>(`0`, __n, __merge_cut_off),
796	[__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
797	for (auto i = __range.begin(); i != __range.end(); ++i)
798	__move_value_construct()(__first1 + i, __first2 + i);
799	});
800	return __first2 + __n;
801	}
802	};
803
804	struct __cleanup_range
805	{
806	template <typename Iterator>
807	void
808	operator()(Iterator __first, Iterator __last)
809	{
810	if (__last - __first < __merge_cut_off)
811	_Cleanup()(__first, __last);
812	else
813	{
814	auto __n = __last - __first;
815	tbb::parallel_for(tbb::blocked_range<_SizeType>(`0`, __n, __merge_cut_off),
816	[__first](const tbb::blocked_range<_SizeType>& __range) {
817	_Cleanup()(__first + __range.begin(), __first + __range.end());
818	});
819	}
820	}
821	};
822
823	public:
824	__merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
825	_Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg,
826	_RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root)
827	: _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_x_beg(__x_beg), _M_z_beg(__z_beg),
828	_M_comp(__comp), _M_leaf_merge(__leaf_merge), _M_nsort(__nsort), _root(__root),
829	_x_orig(__x_orig), _y_orig(__y_orig), _split(false)
830	{
831	}
832
833	bool
834	is_left(_SizeType __idx) const
835	{
836	return _M_xs == __idx;
837	}
838
839	template <typename IndexType>
840	void
841	set_odd(IndexType __idx, bool __on_off)
842	{
843	if (is_left(__idx))
844	_x_orig = __on_off;
845	else
846	_y_orig = __on_off;
847	}
848
849	__task*
850	operator()(__task* __self);
851
852	private:
853	__merge_func*
854	parent_merge(__task* __self) const
855	{
856	return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body();
857	}
858	bool
859	x_less_y()
860	{
861	const auto __nx = (_M_xe - _M_xs);
862	const auto __ny = (_M_ye - _M_ys);
863	_PSTL_ASSERT(__nx > `0` && __ny > `0`);
864
865	_PSTL_ASSERT(_x_orig == _y_orig);
866	_PSTL_ASSERT(!is_partial());
867
868	if (_x_orig)
869	{
870	_PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_comp));
871	_PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_comp));
872	return !_M_comp((_M_x_beg + _M_ys), (_M_x_beg + _M_xe - `1`));
873	}
874
875	_PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
876	_PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));
877	return !_M_comp((_M_z_beg + _M_zs + __nx), (_M_z_beg + _M_zs + __nx - `1`));
878	}
879	void
880	move_x_range()
881	{
882	const auto __nx = (_M_xe - _M_xs);
883	const auto __ny = (_M_ye - _M_ys);
884	_PSTL_ASSERT(__nx > `0` && __ny > `0`);
885
886	if (_x_orig)
887	__move_range_construct()(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_z_beg + _M_zs);
888	else
889	{
890	__move_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx, _M_x_beg + _M_xs);
891	__cleanup_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx);
892	}
893
894	_x_orig = !_x_orig;
895	}
896	void
897	move_y_range()
898	{
899	const auto __nx = (_M_xe - _M_xs);
900	const auto __ny = (_M_ye - _M_ys);
901
902	if (_y_orig)
903	__move_range_construct()(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs + __nx);
904	else
905	{
906	__move_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny, _M_x_beg + _M_ys);
907	__cleanup_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny);
908	}
909
910	_y_orig = !_y_orig;
911	}
912	__task*
913	merge_ranges(__task* __self)
914	{
915	_PSTL_ASSERT(_x_orig == _y_orig); //two merged subrange must be lie into the same buffer
916
917	const auto __nx = (_M_xe - _M_xs);
918	const auto __ny = (_M_ye - _M_ys);
919	const auto __n = __nx + __ny;
920
921	// need to merge {x} and {y}
922	if (__n > __merge_cut_off)
923	return split_merging(__self);
924
925	//merge to buffer
926	if (_x_orig)
927	{
928	_M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs,
929	_M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(),
930	__move_range_construct());
931	_PSTL_ASSERT(parent_merge(__self)); //not root merging task
932	}
933	//merge to "origin"
934	else
935	{
936	_PSTL_ASSERT(_x_orig == _y_orig);
937
938	_PSTL_ASSERT(is_partial() \|\| std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
939	_PSTL_ASSERT(is_partial() \|\| std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));
940
941	const auto __nx = (_M_xe - _M_xs);
942	const auto __ny = (_M_ye - _M_ys);
943
944	_M_leaf_merge(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_x_beg + _M_zs,
945	_M_comp, __move_value(), __move_value(), __move_range(), __move_range());
946
947	__cleanup_range()(_M_z_beg + _M_xs, _M_z_beg + _M_xe);
948	__cleanup_range()(_M_z_beg + _M_ys, _M_z_beg + _M_ye);
949	}
950	return nullptr;
951	}
952
953	__task*
954	process_ranges(__task* __self)
955	{
956	_PSTL_ASSERT(_x_orig == _y_orig);
957	_PSTL_ASSERT(!_split);
958
959	auto p = parent_merge(__self);
960
961	if (!p)
962	{ //root merging task
963
964	//optimization, just for sort algorithm, //{x} <= {y}
965	if (!is_partial() && x_less_y()) //we have a solution
966	{
967	if (!_x_orig)
968	{ //we have to move the solution to the origin
969	move_x_range(); //parallel moving
970	move_y_range(); //parallel moving
971	}
972	return nullptr;
973	}
974	//else: if we have data in the origin,
975	//we have to move data to the buffer for final merging into the origin.
976	if (_x_orig)
977	{
978	move_x_range(); //parallel moving
979	move_y_range(); //parallel moving
980	}
981	// need to merge {x} and {y}.
982	return merge_ranges(__self);
983	}
984	//else: not root merging task (parent_merge() == NULL)
985	//optimization, just for sort algorithm, //{x} <= {y}
986	if (!is_partial() && x_less_y())
987	{
988	const auto id_range = _M_zs;
989	p->set_odd(id_range, _x_orig);
990	return nullptr;
991	}
992	//else: we have to revert "_x(y)_orig" flag of the parent merging task
993	const auto id_range = _M_zs;
994	p->set_odd(id_range, !_x_orig);
995
996	return merge_ranges(__self);
997	}
998
999	//splitting as merge task into 2 of the same level
1000	__task*
1001	split_merging(__task* __self)
1002	{
1003	_PSTL_ASSERT(_x_orig == _y_orig);
1004	const auto __nx = (_M_xe - _M_xs);
1005	const auto __ny = (_M_ye - _M_ys);
1006
1007	_SizeType __xm{};
1008	_SizeType __ym{};
1009	if (__nx < __ny)
1010	{
1011	__ym = _M_ys + __ny / `2`;
1012
1013	if (_x_orig)
1014	__xm = std::upper_bound(_M_x_beg + _M_xs, _M_x_beg + _M_xe, *(_M_x_beg + __ym), _M_comp) - _M_x_beg;
1015	else
1016	__xm = std::upper_bound(_M_z_beg + _M_xs, _M_z_beg + _M_xe, *(_M_z_beg + __ym), _M_comp) - _M_z_beg;
1017	}
1018	else
1019	{
1020	__xm = _M_xs + __nx / `2`;
1021
1022	if (_y_orig)
1023	__ym = std::lower_bound(_M_x_beg + _M_ys, _M_x_beg + _M_ye, *(_M_x_beg + __xm), _M_comp) - _M_x_beg;
1024	else
1025	__ym = std::lower_bound(_M_z_beg + _M_ys, _M_z_beg + _M_ye, *(_M_z_beg + __xm), _M_comp) - _M_z_beg;
1026	}
1027
1028	auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
1029	__merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort,
1030	_M_x_beg, _M_z_beg, _x_orig, _y_orig, _root);
1031	__right_func._split = true;
1032	auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func));
1033	__self->spawn(t: __merge_task);
1034	__self->recycle_as_continuation();
1035
1036	_M_xe = __xm;
1037	_M_ye = __ym;
1038	_split = true;
1039
1040	return __self;
1041	}
1042	};
1043
1044	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename __M_Compare, typename _Cleanup,
1045	typename _LeafMerge>
1046	__task*
1047	__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::
1048	operator()(__task* __self)
1049	{
1050	//a. split merge task into 2 of the same level; the special logic,
1051	//without processing(process_ranges) adjacent sub-ranges x and y
1052	if (_split)
1053	return merge_ranges(__self);
1054
1055	//b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} )
1056
1057	//1. x and y are in the even buffer
1058	//2. x and y are in the odd buffer
1059	if (_x_orig == _y_orig)
1060	return process_ranges(__self);
1061
1062	//3. x is in even buffer, y is in the odd buffer
1063	//4. x is in odd buffer, y is in the even buffer
1064	if (!parent_merge(__self))
1065	{ //root merge task
1066	if (_x_orig)
1067	move_x_range();
1068	else
1069	move_y_range();
1070	}
1071	else
1072	{
1073	const _SizeType __nx = (_M_xe - _M_xs);
1074	const _SizeType __ny = (_M_ye - _M_ys);
1075	_PSTL_ASSERT(__nx > `0`);
1076	_PSTL_ASSERT(__nx > `0`);
1077
1078	if (__nx < __ny)
1079	move_x_range();
1080	else
1081	move_y_range();
1082	}
1083
1084	return process_ranges(__self);
1085	}
1086
1087	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
1088	class __stable_sort_func
1089	{
1090	public:
1091	typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
1092	typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
1093	typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
1094
1095	private:
1096	_RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg;
1097	_RandomAccessIterator2 _M_zs, _M_z_beg;
1098	_Compare _M_comp;
1099	_LeafSort _M_leaf_sort;
1100	bool _M_root;
1101	_SizeType _M_nsort; //zero or number of elements to be sorted for partial_sort alforithm
1102
1103	public:
1104	__stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
1105	bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort,
1106	_RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg)
1107	: _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp),
1108	_M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort)
1109	{
1110	}
1111
1112	__task*
1113	operator()(__task* __self);
1114	};
1115
1116	#define _PSTL_STABLE_SORT_CUT_OFF 500
1117
1118	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
1119	__task*
1120	__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self)
1121	{
1122	typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
1123	__utils::__serial_move_merge>
1124	_MergeTaskType;
1125
1126	const _SizeType __n = _M_xe - _M_xs;
1127	const _SizeType __nmerge = _M_nsort > `0` ? _M_nsort : __n;
1128	const _SizeType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
1129	if (__n <= __sort_cut_off)
1130	{
1131	_M_leaf_sort(_M_xs, _M_xe, _M_comp);
1132	_PSTL_ASSERT(!_M_root);
1133	return nullptr;
1134	}
1135
1136	const _RandomAccessIterator1 __xm = _M_xs + __n / `2`;
1137	const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs);
1138	const _RandomAccessIterator2 __ze = _M_zs + __n;
1139	_MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg,
1140	_M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy (),
1141	__utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
1142	/x_orig/ true, /y_orig/ true, /root/ _M_root));
1143	auto __parent = __self->make_continuation(std::move(__m));
1144	__parent->set_ref_count(`2`);
1145	auto __right = __self->make_child_of(
1146	__parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg));
1147	__self->spawn(t: __right);
1148	__self->recycle_as_child_of(parent: __parent);
1149	_M_root = false;
1150	_M_xe = __xm;
1151
1152	return __self;
1153	}
1154
1155	template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
1156	void
1157	__parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
1158	_LeafSort __leaf_sort, std::size_t __nsort = `0`)
1159	{
1160	tbb::this_task_arena::isolate([=, &__nsort]() {
1161	//sorting based on task tree and parallel merge
1162	typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _ValueType;
1163	typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType;
1164	const _DifferenceType __n = __xe - __xs;
1165	if (__nsort == __n)
1166	__nsort = `0`; // 'partial_sort' becames 'sort'
1167
1168	const _DifferenceType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
1169	if (__n > __sort_cut_off)
1170	{
1171	__buffer<_ValueType> __buf(__n);
1172	__root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{
1173	__xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()};
1174	__task::spawn_root_and_wait(__root);
1175	return;
1176	}
1177	//serial sort
1178	__leaf_sort(__xs, __xe, __comp);
1179	});
1180	}
1181
1182	//------------------------------------------------------------------------
1183	// parallel_merge
1184	//------------------------------------------------------------------------
1185	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
1186	typename _Compare, typename _LeafMerge>
1187	class __merge_func_static
1188	{
1189	_RandomAccessIterator1 _M_xs, _M_xe;
1190	_RandomAccessIterator2 _M_ys, _M_ye;
1191	_RandomAccessIterator3 _M_zs;
1192	_Compare _M_comp;
1193	_LeafMerge _M_leaf_merge;
1194
1195	public:
1196	__merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
1197	_RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
1198	_LeafMerge __leaf_merge)
1199	: _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge)
1200	{
1201	}
1202
1203	__task*
1204	operator()(__task* __self);
1205	};
1206
1207	//TODO: consider usage of parallel_for with a custom blocked_range
1208	template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
1209	typename __M_Compare, typename _LeafMerge>
1210	__task*
1211	__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>::
1212	operator()(__task* __self)
1213	{
1214	typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
1215	typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
1216	typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
1217	const _SizeType __n = (_M_xe - _M_xs) + (_M_ye - _M_ys);
1218	const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
1219	if (__n <= __merge_cut_off)
1220	{
1221	_M_leaf_merge(_M_xs, _M_xe, _M_ys, _M_ye, _M_zs, _M_comp);
1222	return nullptr;
1223	}
1224
1225	_RandomAccessIterator1 __xm;
1226	_RandomAccessIterator2 __ym;
1227	if (_M_xe - _M_xs < _M_ye - _M_ys)
1228	{
1229	__ym = _M_ys + (_M_ye - _M_ys) / `2`;
1230	__xm = std::upper_bound(_M_xs, _M_xe, *__ym, _M_comp);
1231	}
1232	else
1233	{
1234	__xm = _M_xs + (_M_xe - _M_xs) / `2`;
1235	__ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp);
1236	}
1237	const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
1238	auto __right = __self->make_additional_child_of(
1239	__self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge));
1240	__self->spawn(t: __right);
1241	__self->recycle_as_continuation();
1242	_M_xe = __xm;
1243	_M_ye = __ym;
1244
1245	return __self;
1246	}
1247
1248	template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
1249	typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge>
1250	void
1251	__parallel_merge(_ExecutionPolicy&&, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe,
1252	_RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
1253	_LeafMerge __leaf_merge)
1254	{
1255	typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
1256	typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
1257	typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
1258	const _SizeType __n = (__xe - __xs) + (__ye - __ys);
1259	const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
1260	if (__n <= __merge_cut_off)
1261	{
1262	// Fall back on serial merge
1263	__leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
1264	}
1265	else
1266	{
1267	tbb::this_task_arena::isolate([=]() {
1268	typedef __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
1269	_Compare, _LeafMerge>
1270	_TaskType;
1271	__root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge};
1272	__task::spawn_root_and_wait(__root);
1273	});
1274	}
1275	}
1276
1277	//------------------------------------------------------------------------
1278	// parallel_invoke
1279	//------------------------------------------------------------------------
1280	template <class _ExecutionPolicy, typename _F1, typename _F2>
1281	void
1282	__parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2)
1283	{
1284	//TODO: a version of tbb::this_task_arena::isolate with variadic arguments pack should be added in the future
1285	tbb::this_task_arena::isolate([&]() { tbb::parallel_invoke(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); });
1286	}
1287
1288	} // namespace __tbb_backend
1289	} // namespace __pstl
1290
1291	#endif /* _PSTL_PARALLEL_BACKEND_TBB_H */
1292

source code of include/c++/11/pstl/parallel_backend_tbb.h