//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H

#include <__algorithm/inplace_merge.h>
#include <__algorithm/lower_bound.h>
#include <__algorithm/max.h>
#include <__algorithm/merge.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__cstddef/ptrdiff_t.h>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/allocator.h>
#include <__memory/construct_at.h>
#include <__memory/destroy.h>
#include <__memory/unique_ptr.h>
#include <__new/exceptions.h>
#include <__numeric/reduce.h>
#include <__pstl/backend_fwd.h>
#include <__pstl/cpu_algos/any_of.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__pstl/cpu_algos/fill.h>
#include <__pstl/cpu_algos/find_if.h>
#include <__pstl/cpu_algos/for_each.h>
#include <__pstl/cpu_algos/merge.h>
#include <__pstl/cpu_algos/stable_sort.h>
#include <__pstl/cpu_algos/transform.h>
#include <__pstl/cpu_algos/transform_reduce.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <optional>

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

#if _LIBCPP_STD_VER >= 17

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate,
// and neither do we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}
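// The wrapper above type-erases an arbitrary callable: it passes the callable's address through the
// void* __context parameter and re-invokes it from a captureless lambda, which is convertible to the
// plain function pointer that the exported entry point expects. Illustrative use only (__process is a
// hypothetical callable, not part of this header):
//   __libdispatch::__dispatch_apply(4, [&](size_t __chunk) { __process(__chunk); });
// runs __process once per chunk, potentially concurrently.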

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;
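// The concrete partitioning policy lives in the dispatch library, not in this header. The code below
// relies only on the invariant that, for a non-empty input,
//   __size == __first_chunk_size_ + (__chunk_count_ - 1) * __chunk_size_.
// For illustration only (hypothetical values), __partition_chunks(1000) could yield
// {__chunk_count_ = 4, __chunk_size_ = 256, __first_chunk_size_ = 232}: a 232-element first chunk
// followed by three 256-element chunks.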

template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
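  // Chunk 0 covers [0, __first_chunk_size_); chunk __k > 0 starts at offset
  // __first_chunk_size_ + (__k - 1) * __chunk_size_, which the expression below rewrites as
  // (__k * __chunk_size_) + (__first_chunk_size_ - __chunk_size_).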
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}
} // namespace __libdispatch

template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

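  // A fencepost for the parallel merge below: __mid1_ and __mid2_ mark where a chunk begins in each
  // input range, and __result_ marks where that chunk's output begins. Two adjacent fenceposts
  // delimit one independent merge job.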
  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

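    // Plan: compute __chunk_count_ + 1 fenceposts into both input ranges and the output, so that
    // entries __i and __i + 1 delimit one independent merge job; the jobs then run in parallel.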
    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#  if _LIBCPP_HAS_EXCEPTIONS
          try {
#  endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#  if _LIBCPP_HAS_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#  endif
        }(),
        __destroy);

    if (!__ranges)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger
    // range
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

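    // Compute the next fencepost: advance by one chunk through the longer range, then binary-search
    // (std::lower_bound) the matching split point in the shorter range, so every element that sorts
    // below the chunk boundary lands in this chunk's output. The output fencepost advances by the
    // combined size of both sub-ranges, so each job writes to a disjoint region of the output.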
    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }

  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
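    // Two-phase strategy: each chunk writes a partial result into one slot of this uninitialized
    // buffer in parallel. For a chunk of size >= 2 the seed passed to __reduction is __combiner applied
    // to the transformed first two elements (hence the reduction proper starts at __index + 2); a
    // single-element chunk stores just the transformed element.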
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index = __chunk == 0 ? 0
                                  : (__chunk * __partitions.__chunk_size_) +
                                        (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

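    // Fold the per-chunk partials into __init sequentially, moving each partial out of the temporary
    // buffer. Chunk order is preserved, so this step relies on __combiner being associative rather
    // than commutative.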
    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
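    // The daisy-chain below move-constructs __values[0] from *__first and each subsequent buffer
    // element from its predecessor, then restores the original value of *__first from the last buffer
    // element. Every buffer element thus ends up constructed (in a moved-from state) without
    // permanently losing any element of the input range.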
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

    bool __objects_are_in_buffer = false;
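    // Bottom-up merge: each pass merges adjacent pairs of sorted chunks, doubling __chunk_size_ and
    // halving __chunk_count_, ping-ponging the elements between the input range and the temporary
    // buffer. An odd chunk count is first evened out by folding one chunk into the first chunk with a
    // sequential std::inplace_merge.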
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);

    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

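  // Granularity used by the generic CPU algorithm helpers; presumably chosen to match a typical
  // cache-line size (in bytes) to limit false sharing between worker threads.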
  static constexpr size_t __lane_size = 64;
};

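// Each specialization below wires the libdispatch backend tag into the corresponding generic
// CPU-backend implementation from the <__pstl/cpu_algos/...> headers included above, which in turn
// call back into the __cpu_traits<__libdispatch_backend_tag> primitives defined in this file.
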
// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP_STD_VER >= 17

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H