1/*
2 Copyright (c) 2005-2021 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15*/
16
17#ifndef __TBB_parallel_for_H
18#define __TBB_parallel_for_H
19
20#include "detail/_config.h"
21#include "detail/_namespace_injection.h"
22#include "detail/_exception.h"
23#include "detail/_task.h"
24#include "detail/_small_object_pool.h"
25#include "profiling.h"
26
27#include "partitioner.h"
28#include "blocked_range.h"
29#include "task_group.h"
30
31#include <cstddef>
32#include <new>
33
34namespace tbb {
35namespace detail {
36#if __TBB_CPP20_CONCEPTS_PRESENT
37inline namespace d0 {
38
39template <typename Body, typename Range>
40concept parallel_for_body = std::copy_constructible<Body> &&
41 requires( const std::remove_reference_t<Body>& body, Range& range ) {
42 body(range);
43 };
44
45template <typename Index>
46concept parallel_for_index = std::constructible_from<Index, int> &&
47 std::copyable<Index> &&
48 requires( const std::remove_reference_t<Index>& lhs, const std::remove_reference_t<Index>& rhs ) {
49 { lhs < rhs } -> adaptive_same_as<bool>;
50 { lhs - rhs } -> std::convertible_to<std::size_t>;
51 { lhs + (rhs - lhs) } -> std::convertible_to<Index>;
52 };
53
54template <typename Function, typename Index>
55concept parallel_for_function = requires( const std::remove_reference_t<Function>& func, Index index ) {
56 func(index);
57};
58
59} // namespace d0
60#endif // __TBB_CPP20_CONCEPTS_PRESENT
61namespace d1 {
62
63//! Task type used in parallel_for
64/** @ingroup algorithms */
65template<typename Range, typename Body, typename Partitioner>
66struct start_for : public task {
67 Range my_range;
68 const Body my_body;
69 node* my_parent;
70
71 typename Partitioner::task_partition_type my_partition;
72 small_object_allocator my_allocator;
73
74 task* execute(execution_data&) override;
75 task* cancel(execution_data&) override;
76 void finalize(const execution_data&);
77
78 //! Constructor for root task.
79 start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) :
80 my_range(range),
81 my_body(body),
82 my_partition(partitioner),
83 my_allocator(alloc) {}
84 //! Splitting constructor used to generate children.
85 /** parent_ becomes left child. Newly constructed object is right child. */
86 start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) :
87 my_range(parent_.my_range, get_range_split_object<Range>(split_obj)),
88 my_body(parent_.my_body),
89 my_partition(parent_.my_partition, split_obj),
90 my_allocator(alloc) {}
91 //! Construct right child from the given range as response to the demand.
92 /** parent_ remains left child. Newly constructed object is right child. */
93 start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) :
94 my_range(r),
95 my_body(parent_.my_body),
96 my_partition(parent_.my_partition, split()),
97 my_allocator(alloc)
98 {
99 my_partition.align_depth( d );
100 }
101 static void run(const Range& range, const Body& body, Partitioner& partitioner) {
102 task_group_context context(PARALLEL_FOR);
103 run(range, body, partitioner, context);
104 }
105
106 static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) {
107 if ( !range.empty() ) {
108 small_object_allocator alloc{};
109 start_for& for_task = *alloc.new_object<start_for>(range, body, partitioner, alloc);
110
111 // defer creation of the wait node until task allocation succeeds
112 wait_node wn;
113 for_task.my_parent = &wn;
114 execute_and_wait(for_task, context, wn.m_wait, context);
115 }
116 }
117 //! Run body for range, serves as callback for partitioner
118 void run_body( Range &r ) {
119 my_body( r );
120 }
121
122 //! spawn right task, serves as callback for partitioner
123 void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) {
124 offer_work_impl(ed, *this, split_obj);
125 }
126
127 //! spawn right task, serves as callback for partitioner
128 void offer_work(const Range& r, depth_t d, execution_data& ed) {
129 offer_work_impl(ed, *this, r, d);
130 }
131
132private:
133 template <typename... Args>
134 void offer_work_impl(execution_data& ed, Args&&... constructor_args) {
135 // New right child
136 small_object_allocator alloc{};
137 start_for& right_child = *alloc.new_object<start_for>(ed, std::forward<Args>(constructor_args)..., alloc);
138
139 // New root node as a continuation and ref count. Left and right child attach to the new parent.
140 right_child.my_parent = my_parent = alloc.new_object<tree_node>(ed, args&: my_parent, args: 2, args&: alloc);
141 // Spawn the right sibling
142 right_child.spawn_self(ed);
143 }
144
145 void spawn_self(execution_data& ed) {
146 my_partition.spawn_task(*this, *context(ed));
147 }
148};
149
150//! fold the tree and deallocate the task
151template<typename Range, typename Body, typename Partitioner>
152void start_for<Range, Body, Partitioner>::finalize(const execution_data& ed) {
153 // Get the current parent and allocator an object destruction
154 node* parent = my_parent;
155 auto allocator = my_allocator;
156 // Task execution finished - destroy it
157 this->~start_for();
158 // Unwind the tree decrementing the parent`s reference count
159
160 fold_tree<tree_node>(n: parent, ed);
161 allocator.deallocate(this, ed);
162
163}
164
165//! execute task for parallel_for
166template<typename Range, typename Body, typename Partitioner>
167task* start_for<Range, Body, Partitioner>::execute(execution_data& ed) {
168 if (!is_same_affinity(ed)) {
169 my_partition.note_affinity(execution_slot(ed));
170 }
171 my_partition.check_being_stolen(*this, ed);
172 my_partition.execute(*this, my_range, ed);
173 finalize(ed);
174 return nullptr;
175}
176
177//! cancel task for parallel_for
178template<typename Range, typename Body, typename Partitioner>
179task* start_for<Range, Body, Partitioner>::cancel(execution_data& ed) {
180 finalize(ed);
181 return nullptr;
182}
183
184//! Calls the function with values from range [begin, end) with a step provided
185template<typename Function, typename Index>
186class parallel_for_body_wrapper : detail::no_assign {
187 const Function &my_func;
188 const Index my_begin;
189 const Index my_step;
190public:
191 parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step )
192 : my_func(_func), my_begin(_begin), my_step(_step) {}
193
194 void operator()( const blocked_range<Index>& r ) const {
195 // A set of local variables to help the compiler with vectorization of the following loop.
196 Index b = r.begin();
197 Index e = r.end();
198 Index ms = my_step;
199 Index k = my_begin + b*ms;
200
201#if __INTEL_COMPILER
202#pragma ivdep
203#if __TBB_ASSERT_ON_VECTORIZATION_FAILURE
204#pragma vector always assert
205#endif
206#endif
207 for ( Index i = b; i < e; ++i, k += ms ) {
208 my_func( k );
209 }
210 }
211};
212
213// Requirements on Range concept are documented in blocked_range.h
214
215/** \page parallel_for_body_req Requirements on parallel_for body
216 Class \c Body implementing the concept of parallel_for body must define:
217 - \code Body::Body( const Body& ); \endcode Copy constructor
218 - \code Body::~Body(); \endcode Destructor
219 - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r.
220**/
221
222/** \name parallel_for
223 See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/
224//@{
225
226//! Parallel iteration over range with default partitioner.
227/** @ingroup algorithms **/
228template<typename Range, typename Body>
229 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
230void parallel_for( const Range& range, const Body& body ) {
231 start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range,body,__TBB_DEFAULT_PARTITIONER());
232}
233
234//! Parallel iteration over range with simple partitioner.
235/** @ingroup algorithms **/
236template<typename Range, typename Body>
237 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
238void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) {
239 start_for<Range,Body,const simple_partitioner>::run(range,body,partitioner);
240}
241
242//! Parallel iteration over range with auto_partitioner.
243/** @ingroup algorithms **/
244template<typename Range, typename Body>
245 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
246void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) {
247 start_for<Range,Body,const auto_partitioner>::run(range,body,partitioner);
248}
249
250//! Parallel iteration over range with static_partitioner.
251/** @ingroup algorithms **/
252template<typename Range, typename Body>
253 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
254void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) {
255 start_for<Range,Body,const static_partitioner>::run(range,body,partitioner);
256}
257
258//! Parallel iteration over range with affinity_partitioner.
259/** @ingroup algorithms **/
260template<typename Range, typename Body>
261 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
262void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) {
263 start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner);
264}
265
266//! Parallel iteration over range with default partitioner and user-supplied context.
267/** @ingroup algorithms **/
268template<typename Range, typename Body>
269 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
270void parallel_for( const Range& range, const Body& body, task_group_context& context ) {
271 start_for<Range,Body,const __TBB_DEFAULT_PARTITIONER>::run(range, body, __TBB_DEFAULT_PARTITIONER(), context);
272}
273
274//! Parallel iteration over range with simple partitioner and user-supplied context.
275/** @ingroup algorithms **/
276template<typename Range, typename Body>
277 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
278void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) {
279 start_for<Range,Body,const simple_partitioner>::run(range, body, partitioner, context);
280}
281
282//! Parallel iteration over range with auto_partitioner and user-supplied context.
283/** @ingroup algorithms **/
284template<typename Range, typename Body>
285 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
286void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) {
287 start_for<Range,Body,const auto_partitioner>::run(range, body, partitioner, context);
288}
289
290//! Parallel iteration over range with static_partitioner and user-supplied context.
291/** @ingroup algorithms **/
292template<typename Range, typename Body>
293 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
294void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) {
295 start_for<Range,Body,const static_partitioner>::run(range, body, partitioner, context);
296}
297
298//! Parallel iteration over range with affinity_partitioner and user-supplied context.
299/** @ingroup algorithms **/
300template<typename Range, typename Body>
301 __TBB_requires(tbb_range<Range> && parallel_for_body<Body, Range>)
302void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) {
303 start_for<Range,Body,affinity_partitioner>::run(range,body,partitioner, context);
304}
305
306//! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner
307template <typename Index, typename Function, typename Partitioner>
308void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) {
309 if (step <= 0 )
310 throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
311 else if (first < last) {
312 // Above "else" avoids "potential divide by zero" warning on some platforms
313 Index end = (last - first - Index(1)) / step + Index(1);
314 blocked_range<Index> range(static_cast<Index>(0), end);
315 parallel_for_body_wrapper<Function, Index> body(f, first, step);
316 parallel_for(range, body, partitioner);
317 }
318}
319
320//! Parallel iteration over a range of integers with a step provided and default partitioner
321template <typename Index, typename Function>
322 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
323void parallel_for(Index first, Index last, Index step, const Function& f) {
324 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner());
325}
326//! Parallel iteration over a range of integers with a step provided and simple partitioner
327template <typename Index, typename Function>
328 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
329void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) {
330 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner);
331}
332//! Parallel iteration over a range of integers with a step provided and auto partitioner
333template <typename Index, typename Function>
334 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
335void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) {
336 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner);
337}
338//! Parallel iteration over a range of integers with a step provided and static partitioner
339template <typename Index, typename Function>
340 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
341void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) {
342 parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner);
343}
344//! Parallel iteration over a range of integers with a step provided and affinity partitioner
345template <typename Index, typename Function>
346 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
347void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) {
348 parallel_for_impl(first, last, step, f, partitioner);
349}
350
351//! Parallel iteration over a range of integers with a default step value and default partitioner
352template <typename Index, typename Function>
353 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
354void parallel_for(Index first, Index last, const Function& f) {
355 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner());
356}
357//! Parallel iteration over a range of integers with a default step value and simple partitioner
358template <typename Index, typename Function>
359 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
360void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) {
361 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
362}
363//! Parallel iteration over a range of integers with a default step value and auto partitioner
364template <typename Index, typename Function>
365 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
366void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) {
367 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
368}
369//! Parallel iteration over a range of integers with a default step value and static partitioner
370template <typename Index, typename Function>
371 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
372void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) {
373 parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner);
374}
375//! Parallel iteration over a range of integers with a default step value and affinity partitioner
376template <typename Index, typename Function>
377 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
378void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) {
379 parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner);
380}
381
382//! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner
383template <typename Index, typename Function, typename Partitioner>
384void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) {
385 if (step <= 0 )
386 throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument
387 else if (first < last) {
388 // Above "else" avoids "potential divide by zero" warning on some platforms
389 Index end = (last - first - Index(1)) / step + Index(1);
390 blocked_range<Index> range(static_cast<Index>(0), end);
391 parallel_for_body_wrapper<Function, Index> body(f, first, step);
392 parallel_for(range, body, partitioner, context);
393 }
394}
395
396//! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner
397template <typename Index, typename Function>
398 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
399void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) {
400 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, auto_partitioner(), context);
401}
402//! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner
403template <typename Index, typename Function>
404 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
405void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
406 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, step, f, partitioner, context);
407}
408//! Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner
409template <typename Index, typename Function>
410 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
411void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
412 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, step, f, partitioner, context);
413}
414//! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner
415template <typename Index, typename Function>
416 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
417void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
418 parallel_for_impl<Index,Function,const static_partitioner>(first, last, step, f, partitioner, context);
419}
420//! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner
421template <typename Index, typename Function>
422 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
423void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
424 parallel_for_impl(first, last, step, f, partitioner, context);
425}
426
427//! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner
428template <typename Index, typename Function>
429 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
430void parallel_for(Index first, Index last, const Function& f, task_group_context &context) {
431 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, auto_partitioner(), context);
432}
433//! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner
434template <typename Index, typename Function>
435 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
436void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) {
437 parallel_for_impl<Index,Function,const simple_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
438}
439//! Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner
440template <typename Index, typename Function>
441 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
442void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) {
443 parallel_for_impl<Index,Function,const auto_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
444}
445//! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner
446template <typename Index, typename Function>
447 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
448void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) {
449 parallel_for_impl<Index,Function,const static_partitioner>(first, last, static_cast<Index>(1), f, partitioner, context);
450}
451//! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner
452template <typename Index, typename Function>
453 __TBB_requires(parallel_for_index<Index> && parallel_for_function<Function, Index>)
454void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) {
455 parallel_for_impl(first, last, static_cast<Index>(1), f, partitioner, context);
456}
457// @}
458
459} // namespace d1
460} // namespace detail
461
462inline namespace v1 {
463using detail::d1::parallel_for;
464// Split types
465using detail::split;
466using detail::proportional_split;
467} // namespace v1
468
469} // namespace tbb
470
471#endif /* __TBB_parallel_for_H */
472

source code of include/oneapi/tbb/parallel_for.h