1/*
2 Copyright (c) 2005-2023 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15*/
16
17#ifndef __TBB_parallel_for_H
18#define __TBB_parallel_for_H
19
20#include "detail/_config.h"
21#include "detail/_namespace_injection.h"
22#include "detail/_exception.h"
23#include "detail/_task.h"
24#include "detail/_small_object_pool.h"
25#include "profiling.h"
26
27#include "partitioner.h"
28#include "blocked_range.h"
29#include "task_group.h"
30
31#include <cstddef>
32#include <new>
33
34namespace tbb {
35namespace detail {
36#if __TBB_CPP20_CONCEPTS_PRESENT
37inline namespace d0 {
38
39template <typename Body, typename Range>
40concept parallel_for_body = std::copy_constructible<Body> && std::invocable<const std::remove_reference_t<Body>&, Range&>;
41
42template <typename Index>
43concept parallel_for_index = std::constructible_from<Index, int> &&
44 std::copyable<Index> &&
45 requires( const std::remove_reference_t<Index>& lhs, const std::remove_reference_t<Index>& rhs ) {
46 { lhs < rhs } -> adaptive_same_as<bool>;
47 { lhs - rhs } -> std::convertible_to<std::size_t>;
48 { lhs + (rhs - lhs) } -> std::convertible_to<Index>;
49 };
50
51template <typename Function, typename Index>
52concept parallel_for_function = std::invocable<const std::remove_reference_t<Function>&, Index>;
53
54} // namespace d0
55#endif // __TBB_CPP20_CONCEPTS_PRESENT
56namespace d1 {
57
58//! Task type used in parallel_for
59/** @ingroup algorithms */
60template<typename Range, typename Body, typename Partitioner>
61struct start_for : public task {
62 Range my_range;
63 const Body my_body;
64 node* my_parent;
65
66 typename Partitioner::task_partition_type my_partition;
67 small_object_allocator my_allocator;
68
69 task* execute(execution_data&) override;
70 task* cancel(execution_data&) override;
71 void finalize(const execution_data&);
72
73 //! Constructor for root task.
74 start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
75 my_range(range),
76 my_body(body),
77 my_parent(nullptr),
78 my_partition(partitioner),
79 my_allocator(alloc) {}
80 //! Splitting constructor used to generate children.
81 /** parent_ becomes left child. Newly constructed object is right child. */
82 start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
83 my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
84 my_body(parent_.my_body),
85 my_parent(nullptr),
86 my_partition(parent_.my_partition, split_obj),
87 my_allocator(alloc) {}
88 //! Construct right child from the given range as response to the demand.
89 /** parent_ remains left child. Newly constructed object is right child. */
90 start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
91 my_range(r),
92 my_body(parent_.my_body),
93 my_parent(nullptr),
94 my_partition(parent_.my_partition, split()),
95 my_allocator(alloc)
96 {
97 my_partition.align_depth( d );
98 }
99 static void run(const Range& range, const Body& body, Partitioner& partitioner) {
100 task_group_context context(PARALLEL_FOR);
101 run(range, body, partitioner, context);
102 }
103
104 static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) {
105 if ( !range.empty() ) {
106 small_object_allocator alloc{};
107 start_for& for_task = *alloc.new_object<start_for>(range, body, partitioner, alloc);
108
109 // defer creation of the wait node until task allocation succeeds
110 wait_node wn;
111 for_task.my_parent = &wn;
112 execute_and_wait(for_task, context, wn.m_wait, context);
113 }
114 }
115 //! Run body for range, serves as callback for partitioner
116 void run_body( Range &r ) {
117 tbb::detail::invoke(my_body, r);
118 }
119
120 //! spawn right task, serves as callback for partitioner
121 void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
122 offer_work_impl(ed, *this, split_obj);
123 }
124
125 //! spawn right task, serves as callback for partitioner
126 void offer_work(const Range& r, depth_t d, execution_data& ed) {
127 offer_work_impl(ed, *this, r, d);
128 }
129
130private:
131 template <typename... Args>
132 void offer_work_impl(execution_data& ed, Args&&... constructor_args) {
133 // New right child
134 small_object_allocator alloc{};
135 start_for& right_child = *alloc.new_object<start_for>(ed, std::forward<Args>(constructor_args)..., alloc);
136
137 // New root node as a continuation and ref count. Left and right child attach to the new parent.
138 right_child.my_parent = my_parent = alloc.new_object<tree_node>(ed, args&: my_parent, args: 2, args&: alloc);
139 // Spawn the right sibling
140 right_child.spawn_self(ed);
141 }
142
143 void spawn_self(execution_data& ed) {
144 my_partition.spawn_task(*this, *context(ed));
145 }
146};
147
148//! fold the tree and deallocate the task
149template<typename Range, typename Body, typename Partitioner>
150void start_for<Range, Body, Partitioner>::finalize(const execution_data& ed) {
151 // Get the current parent and allocator an object destruction
152 node* parent = my_parent;
153 auto allocator = my_allocator;
154 // Task execution finished - destroy it
155 this->~start_for();
156 // Unwind the tree decrementing the parent`s reference count
157
158 fold_tree<tree_node>(n: parent, ed);
159 allocator.deallocate(this, ed);
160
161}
162
163//! execute task for parallel_for
164template<typename Range, typename Body, typename Partitioner>
165task* start_for<Range, Body, Partitioner>::execute(execution_data& ed) {
166 if (!is_same_affinity(ed)) {
167 my_partition.note_affinity(execution_slot(ed));
168 }
169 my_partition.check_being_stolen(*this, ed);
170 my_partition.execute(*this, my_range, ed);
171 finalize(ed);
172 return nullptr;
173}
174
175//! cancel task for parallel_for
176template<typename Range, typename Body, typename Partitioner>
177task* start_for<Range, Body, Partitioner>::cancel(execution_data& ed) {
178 finalize(ed);
179 return nullptr;
180}
181
182//! Calls the function with values from range [begin, end) with a step provided
183template<typename Function, typename Index>
184class parallel_for_body_wrapper : detail::no_assign {
185 const Function &my_func;
186 const Index my_begin;
187 const Index my_step;
188public:
189 parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step )
190 : my_func(_func), my_begin(_begin), my_step(_step) {}
191
192 void operator()( const blocked_range<Index>& r ) const {
193 // A set of local variables to help the compiler with vectorization of the following loop.
194 Index b = r.begin();
195 Index e = r.end();
196 Index ms = my_step;
197 Index k = my_begin + b*ms;
198
199#if __INTEL_COMPILER
200#pragma ivdep
201#if __TBB_ASSERT_ON_VECTORIZATION_FAILURE
202#pragma vector always assert
203#endif
204#endif
205 for ( Index i = b; i < e; ++i, k += ms ) {
206 tbb::detail::invoke(my_func, k);
207 }
208 }
209};
210
211// Requirements on Range concept are documented in blocked_range.h
212
213/** \page parallel_for_body_req Requirements on parallel_for body
214 Class \c Body implementing the concept of parallel_for body must define:
215 - \code Body::Body( const Body& ); \endcode Copy constructor
216 - \code Body::~Body(); \endcode Destructor
217 - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r.
218**/
219
220/** \name parallel_for
221 See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/
222//@{
223
224//! Parallel iteration over range with default partitioner.
225/** @ingroup algorithms **/
226template<typename Range, typename Body>
227 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
228void parallel_for( const Range& range, const Body& body ) {
229 start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
230}
231
232//! Parallel iteration over range with simple partitioner.
233/** @ingroup algorithms **/
234template<typename Range, typename Body>
235 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
236void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) {
237 start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner);
238}
239
240//! Parallel iteration over range with auto_partitioner.
241/** @ingroup algorithms **/
242template<typename Range, typename Body>
243 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
244void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) {
245 start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner);
246}
247
248//! Parallel iteration over range with static_partitioner.
249/** @ingroup algorithms **/
250template<typename Range, typename Body>
251 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
252void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) {
253 start_for<Range,Body,const static_partitioner>::run(range,body,partitioner);
254}
255
256//! Parallel iteration over range with affinity_partitioner.
257/** @ingroup algorithms **/
258template<typename Range, typename Body>
259 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
260void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) {
261 start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
262}
263
264//! Parallel iteration over range with default partitioner and user-supplied context.
265/** @ingroup algorithms **/
266template<typename Range, typename Body>
267 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
268void parallel_for( const Range& range, const Body& body, task_group_context& context ) {
269 start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context);
270}
271
272//! Parallel iteration over range with simple partitioner and user-supplied context.
273/** @ingroup algorithms **/
274template<typename Range, typename Body>
275 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
276void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
277 start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context);
278}
279
280//! Parallel iteration over range with auto_partitioner and user-supplied context.
281/** @ingroup algorithms **/
282template<typename Range, typename Body>
283 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
284void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
285 start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context);
286}
287
288//! Parallel iteration over range with static_partitioner and user-supplied context.
289/** @ingroup algorithms **/
290template<typename Range, typename Body>
291 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
292void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) {
293 start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context);
294}
295
296//! Parallel iteration over range with affinity_partitioner and user-supplied context.
297/** @ingroup algorithms **/
298template<typename Range, typename Body>
299 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
300void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
301 start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context);
302}
303
304//! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner
305template <typename Index, typename Function, typename Partitioner>
306void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) {
307 if (step <= 0 )
308 throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
309 else if (first < last) {
310 // Above "else" avoids "potential divide by zero" warning on some platforms
311 Index end = Index(last - first - 1ul) / step + Index(1);
312 blocked_range<Index> range(static_cast<Index>(0), end);
313 parallel_for_body_wrapper<Function, Index> body(f, first, step);
314 parallel_for(range, body, partitioner);
315 }
316}
317
318//! Parallel iteration over a range of integers with a step provided and default partitioner
319template <typename Index, typename Function>
320 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
321void parallel_for(Index first, Index last, Index step, const Function& f) {
322 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner());
323}
324//! Parallel iteration over a range of integers with a step provided and simple partitioner
325template <typename Index, typename Function>
326 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
327void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) {
328 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner);
329}
330//! Parallel iteration over a range of integers with a step provided and auto partitioner
331template <typename Index, typename Function>
332 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
333void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) {
334 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner);
335}
336//! Parallel iteration over a range of integers with a step provided and static partitioner
337template <typename Index, typename Function>
338 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
339void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) {
340 parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner);
341}
342//! Parallel iteration over a range of integers with a step provided and affinity partitioner
343template <typename Index, typename Function>
344 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
345void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) {
346 parallel_for_impl(first, last, step, f, partitioner);
347}
348
349//! Parallel iteration over a range of integers with a default step value and default partitioner
350template <typename Index, typename Function>
351 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
352void parallel_for(Index first, Index last, const Function& f) {
353 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner());
354}
355//! Parallel iteration over a range of integers with a default step value and simple partitioner
356template <typename Index, typename Function>
357 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
358void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) {
359 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
360}
361//! Parallel iteration over a range of integers with a default step value and auto partitioner
362template <typename Index, typename Function>
363 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
364void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) {
365 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
366}
367//! Parallel iteration over a range of integers with a default step value and static partitioner
368template <typename Index, typename Function>
369 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
370void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) {
371 parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
372}
373//! Parallel iteration over a range of integers with a default step value and affinity partitioner
374template <typename Index, typename Function>
375 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
376void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) {
377 parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner);
378}
379
380//! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner
381template <typename Index, typename Function, typename Partitioner>
382void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) {
383 if (step <= 0 )
384 throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
385 else if (first < last) {
386 // Above "else" avoids "potential divide by zero" warning on some platforms
387 Index end = (last - first - Index(1)) / step + Index(1);
388 blocked_range<Index> range(static_cast<Index>(0), end);
389 parallel_for_body_wrapper<Function, Index> body(f, first, step);
390 parallel_for(range, body, partitioner, context);
391 }
392}
393
394//! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner
395template <typename Index, typename Function>
396 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
397void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) {
398 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context);
399}
400//! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner
401template <typename Index, typename Function>
402 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
403void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
404 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context);
405}
406//! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner
407template <typename Index, typename Function>
408 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
409void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
410 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context);
411}
412//! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner
413template <typename Index, typename Function>
414 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
415void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
416 parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context);
417}
418//! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner
419template <typename Index, typename Function>
420 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
421void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
422 parallel_for_impl(first, last, step, f, partitioner, context);
423}
424
425//! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner
426template <typename Index, typename Function>
427 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
428void parallel_for(Index first, Index last, const Function& f, task_group_context &context) {
429 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context);
430}
431//! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner
432template <typename Index, typename Function>
433 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
434void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
435 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
436}
437//! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner
438template <typename Index, typename Function>
439 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
440void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
441 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
442}
443//! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner
444template <typename Index, typename Function>
445 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
446void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
447 parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
448}
449//! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner
450template <typename Index, typename Function>
451 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
452void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
453 parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context);
454}
//@}
456
457} // namespace d1
458} // namespace detail
459
460inline namespace v1 {
461using detail::d1::parallel_for;
462// Split types
463using detail::split;
464using detail::proportional_split;
465} // namespace v1
466
467} // namespace tbb
468
469#endif /* __TBB_parallel_for_H */
470

// Source: include/oneapi/tbb/parallel_for.h