//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H

#include <__algorithm/inplace_merge.h>
#include <__algorithm/lower_bound.h>
#include <__algorithm/max.h>
#include <__algorithm/merge.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__cstddef/ptrdiff_t.h>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/allocator.h>
#include <__memory/construct_at.h>
#include <__memory/destroy.h>
#include <__memory/unique_ptr.h>
#include <__new/exceptions.h>
#include <__numeric/reduce.h>
#include <__pstl/backend_fwd.h>
#include <__pstl/cpu_algos/any_of.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__pstl/cpu_algos/fill.h>
#include <__pstl/cpu_algos/find_if.h>
#include <__pstl/cpu_algos/for_each.h>
#include <__pstl/cpu_algos/merge.h>
#include <__pstl/cpu_algos/stable_sort.h>
#include <__pstl/cpu_algos/transform.h>
#include <__pstl/cpu_algos/transform_reduce.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <optional>

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

#if _LIBCPP_STD_VER >= 17

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

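// The overload below is a convenience wrapper around the exported entry point above: it type-erases the functor by
// passing its address as the void* context and re-invoking it from a captureless lambda. Illustrative usage only
// (the chunk body shown is hypothetical):
//   __libdispatch::__dispatch_apply(__chunk_count, [&](size_t __chunk) { /* process chunk __chunk */ });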
template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;

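// How a partition is consumed below: a range of __size elements is split into __chunk_count_ chunks, where the first
// chunk holds __first_chunk_size_ elements and every other chunk holds __chunk_size_ elements, so chunk __c (for
// __c > 0) starts at offset __first_chunk_size_ + (__c - 1) * __chunk_size_. The actual partitioning policy lives
// behind __partition_chunks in the dylib.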
template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}
} // namespace __libdispatch

template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };
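  // A __merge_range records how far the merge has progressed in each input and in the output. Consecutive records
  // built below delimit one independent sub-merge that can be handed to __leaf_merge.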

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

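    // From here on there are at least two chunks. The strategy implemented below is to walk the larger input in
    // chunks, find the matching split point in the smaller input with a binary search, record the resulting
    // boundaries in __ranges, and then let libdispatch run __leaf_merge on each pair of adjacent boundaries in
    // parallel, writing each piece directly to its final position in __result.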
    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#  if _LIBCPP_HAS_EXCEPTIONS
          try {
#  endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#  if _LIBCPP_HAS_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#  endif
        }(),
        __destroy);

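    // If the boundary buffer cannot be allocated, give up and report failure through a disengaged optional instead
    // of letting an exception escape this noexcept function; the caller decides how to surface the failure.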
    if (!__ranges)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger one
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

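    // __compute_chunk consumes the next __chunk_size elements of whichever input is larger, locates the matching
    // split point in the other input with std::lower_bound (the first element not less than the last element of the
    // chunk), advances __first1/__first2/__result past the chunk, and returns the boundary record for its upper end.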
    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }

  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

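    // Reduce each chunk independently into __values[__chunk]: a chunk with at least two elements seeds its reduction
    // with __combiner applied to its first two transformed elements and then folds in the remaining elements via
    // __reduction; a single-element chunk just stores its transformed element.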
    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index = __chunk == 0 ? 0
                                  : (__chunk * __partitions.__chunk_size_) +
                                        (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

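    // Every chunk has left its partial result in __values[__chunk]; fold the partial results into the caller's
    // initial value. The partial results are moved rather than copied since the buffer is discarded afterwards.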
    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);
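    // At this point every buffer element is a live _Value in a valid (moved-from) state, so the merge passes below
    // can move-assign into the buffer, and *__first has received its original value back from the end of the chain.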

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

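    // Bottom-up merge: each pass merges pairs of adjacent sorted runs into runs twice as large, ping-ponging between
    // the input range and the temporary buffer. When the number of runs is odd, the first run is first merged with
    // its neighbour serially (std::inplace_merge) so the parallel pass always works on an even pairing; the first
    // chunk of the next pass grows accordingly.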
    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);

    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

  static constexpr size_t __lane_size = 64;
};
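// The specializations below plug the libdispatch backend into the generic CPU algorithm implementations, which are
// parameterized on the __cpu_traits specialization above.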

// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP_STD_VER >= 17

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H