1 | /* |
2 | Copyright (c) 2005-2021 Intel Corporation |
3 | |
4 | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | you may not use this file except in compliance with the License. |
6 | You may obtain a copy of the License at |
7 | |
8 | http://www.apache.org/licenses/LICENSE-2.0 |
9 | |
10 | Unless required by applicable law or agreed to in writing, software |
11 | distributed under the License is distributed on an "AS IS" BASIS, |
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | See the License for the specific language governing permissions and |
14 | limitations under the License. |
15 | */ |
16 | |
17 | #ifndef __TBB_partitioner_H |
18 | #define __TBB_partitioner_H |
19 | |
20 | #ifndef __TBB_INITIAL_CHUNKS |
21 | // initial task divisions per thread |
22 | #define __TBB_INITIAL_CHUNKS 2 |
23 | #endif |
24 | #ifndef __TBB_RANGE_POOL_CAPACITY |
25 | // maximum number of elements in range pool |
26 | #define __TBB_RANGE_POOL_CAPACITY 8 |
27 | #endif |
28 | #ifndef __TBB_INIT_DEPTH |
29 | // initial value for depth of range pool |
30 | #define __TBB_INIT_DEPTH 5 |
31 | #endif |
32 | #ifndef __TBB_DEMAND_DEPTH_ADD |
// when imbalance is detected, the range is split this many additional times
34 | #define __TBB_DEMAND_DEPTH_ADD 1 |
35 | #endif |
36 | |
37 | #include "detail/_config.h" |
38 | #include "detail/_namespace_injection.h" |
39 | #include "detail/_aligned_space.h" |
40 | #include "detail/_utils.h" |
41 | #include "detail/_template_helpers.h" |
42 | #include "detail/_range_common.h" |
43 | #include "detail/_task.h" |
44 | #include "detail/_small_object_pool.h" |
45 | |
46 | #include "cache_aligned_allocator.h" |
47 | #include "task_group.h" // task_group_context |
48 | #include "task_arena.h" |
49 | |
50 | #include <algorithm> |
51 | #include <atomic> |
52 | #include <type_traits> |
53 | |
54 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) |
55 | // Workaround for overzealous compiler warnings |
56 | #pragma warning (push) |
57 | #pragma warning (disable: 4244) |
58 | #endif |
59 | |
60 | namespace tbb { |
61 | namespace detail { |
62 | |
63 | namespace d1 { |
64 | class auto_partitioner; |
65 | class simple_partitioner; |
66 | class static_partitioner; |
67 | class affinity_partitioner; |
68 | class affinity_partition_type; |
69 | class affinity_partitioner_base; |
70 | |
71 | inline std::size_t get_initial_auto_partitioner_divisor() { |
72 | const std::size_t factor = 4; |
73 | return factor * max_concurrency(); |
74 | } |
75 | |
76 | //! Defines entry point for affinity partitioner into oneTBB run-time library. |
77 | class affinity_partitioner_base: no_copy { |
78 | friend class affinity_partitioner; |
79 | friend class affinity_partition_type; |
80 | //! Array that remembers affinities of tree positions to affinity_id. |
81 | /** NULL if my_size==0. */ |
82 | slot_id* my_array; |
83 | //! Number of elements in my_array. |
84 | std::size_t my_size; |
85 | //! Zeros the fields. |
86 | affinity_partitioner_base() : my_array(nullptr), my_size(0) {} |
87 | //! Deallocates my_array. |
    ~affinity_partitioner_base() { resize(0); }
89 | //! Resize my_array. |
90 | /** Retains values if resulting size is the same. */ |
91 | void resize(unsigned factor) { |
92 | // Check factor to avoid asking for number of workers while there might be no arena. |
93 | unsigned max_threads_in_arena = max_concurrency(); |
94 | std::size_t new_size = factor ? factor * max_threads_in_arena : 0; |
95 | if (new_size != my_size) { |
96 | if (my_array) { |
                r1::cache_aligned_deallocate(my_array);
98 | // Following two assignments must be done here for sake of exception safety. |
99 | my_array = nullptr; |
100 | my_size = 0; |
101 | } |
102 | if (new_size) { |
                my_array = static_cast<slot_id*>(r1::cache_aligned_allocate(new_size * sizeof(slot_id)));
                std::fill_n(my_array, new_size, no_slot);
105 | my_size = new_size; |
106 | } |
107 | } |
108 | } |
109 | }; |
110 | |
111 | template<typename Range, typename Body, typename Partitioner> struct start_for; |
112 | template<typename Range, typename Body, typename Partitioner> struct start_scan; |
113 | template<typename Range, typename Body, typename Partitioner> struct start_reduce; |
114 | template<typename Range, typename Body, typename Partitioner> struct start_deterministic_reduce; |
115 | |
116 | struct node { |
117 | node* my_parent{}; |
118 | std::atomic<int> m_ref_count{}; |
119 | |
120 | node() = default; |
121 | node(node* parent, int ref_count) : |
122 | my_parent{parent}, m_ref_count{ref_count} { |
123 | __TBB_ASSERT(ref_count > 0, "The ref count must be positive" ); |
124 | } |
125 | }; |
126 | |
127 | struct wait_node : node { |
128 | wait_node() : node{ nullptr, 1 } {} |
129 | wait_context m_wait{1}; |
130 | }; |
131 | |
132 | //! Join task node that contains shared flag for stealing feedback |
133 | struct tree_node : public node { |
134 | small_object_allocator m_allocator; |
135 | std::atomic<bool> m_child_stolen{false}; |
136 | |
137 | tree_node(node* parent, int ref_count, small_object_allocator& alloc) |
138 | : node{parent, ref_count} |
139 | , m_allocator{alloc} {} |
140 | |
141 | void join(task_group_context*) {/*dummy, required only for reduction algorithms*/}; |
142 | |
143 | template <typename Task> |
144 | static void mark_task_stolen(Task &t) { |
145 | std::atomic<bool> &flag = static_cast<tree_node*>(t.my_parent)->m_child_stolen; |
146 | #if TBB_USE_PROFILING_TOOLS |
147 | // Threading tools respect lock prefix but report false-positive data-race via plain store |
148 | flag.exchange(true); |
149 | #else |
        flag.store(true, std::memory_order_relaxed);
151 | #endif // TBB_USE_PROFILING_TOOLS |
152 | } |
153 | template <typename Task> |
154 | static bool is_peer_stolen(Task &t) { |
        return static_cast<tree_node*>(t.my_parent)->m_child_stolen.load(std::memory_order_relaxed);
156 | } |
157 | }; |
158 | |
159 | // Context used to check cancellation state during reduction join process |
160 | template<typename TreeNodeType> |
161 | void fold_tree(node* n, const execution_data& ed) { |
162 | for (;;) { |
163 | __TBB_ASSERT(n->m_ref_count.load(std::memory_order_relaxed) > 0, "The refcount must be positive." ); |
164 | call_itt_task_notify(releasing, n); |
165 | if (--n->m_ref_count > 0) { |
166 | return; |
167 | } |
168 | node* parent = n->my_parent; |
169 | if (!parent) { |
170 | break; |
171 | }; |
172 | |
173 | call_itt_task_notify(acquired, n); |
174 | TreeNodeType* self = static_cast<TreeNodeType*>(n); |
175 | self->join(ed.context); |
176 | self->m_allocator.delete_object(self, ed); |
177 | n = parent; |
178 | } |
179 | // Finish parallel for execution when the root (last node) is reached |
180 | static_cast<wait_node*>(n)->m_wait.release(); |
181 | } |
182 | |
//! Depth is the relative depth of recursive division inside a range pool. A relative depth allows
//! effectively unbounded absolute recursion depth for heavily unbalanced workloads whose range is
//! described by a number that does not fit into a machine word.
186 | typedef unsigned char depth_t; |
187 | |
188 | //! Range pool stores ranges of type T in a circular buffer with MaxCapacity |
189 | template <typename T, depth_t MaxCapacity> |
190 | class range_vector { |
191 | depth_t my_head; |
192 | depth_t my_tail; |
193 | depth_t my_size; |
194 | depth_t my_depth[MaxCapacity]; // relative depths of stored ranges |
195 | tbb::detail::aligned_space<T, MaxCapacity> my_pool; |
196 | |
197 | public: |
198 | //! initialize via first range in pool |
199 | range_vector(const T& elem) : my_head(0), my_tail(0), my_size(1) { |
200 | my_depth[0] = 0; |
201 | new( static_cast<void *>(my_pool.begin()) ) T(elem);//TODO: std::move? |
202 | } |
203 | ~range_vector() { |
204 | while( !empty() ) pop_back(); |
205 | } |
206 | bool empty() const { return my_size == 0; } |
207 | depth_t size() const { return my_size; } |
208 | //! Populates range pool via ranges up to max depth or while divisible |
209 | //! max_depth starts from 0, e.g. value 2 makes 3 ranges in the pool up to two 1/4 pieces |
210 | void split_to_fill(depth_t max_depth) { |
211 | while( my_size < MaxCapacity && is_divisible(max_depth) ) { |
212 | depth_t prev = my_head; |
213 | my_head = (my_head + 1) % MaxCapacity; |
214 | new(my_pool.begin()+my_head) T(my_pool.begin()[prev]); // copy TODO: std::move? |
215 | my_pool.begin()[prev].~T(); // instead of assignment |
216 | new(my_pool.begin()+prev) T(my_pool.begin()[my_head], detail::split()); // do 'inverse' split |
217 | my_depth[my_head] = ++my_depth[prev]; |
218 | my_size++; |
219 | } |
220 | } |
221 | void pop_back() { |
222 | __TBB_ASSERT(my_size > 0, "range_vector::pop_back() with empty size" ); |
223 | my_pool.begin()[my_head].~T(); |
224 | my_size--; |
225 | my_head = (my_head + MaxCapacity - 1) % MaxCapacity; |
226 | } |
227 | void pop_front() { |
228 | __TBB_ASSERT(my_size > 0, "range_vector::pop_front() with empty size" ); |
229 | my_pool.begin()[my_tail].~T(); |
230 | my_size--; |
231 | my_tail = (my_tail + 1) % MaxCapacity; |
232 | } |
233 | T& back() { |
234 | __TBB_ASSERT(my_size > 0, "range_vector::back() with empty size" ); |
235 | return my_pool.begin()[my_head]; |
236 | } |
237 | T& front() { |
238 | __TBB_ASSERT(my_size > 0, "range_vector::front() with empty size" ); |
239 | return my_pool.begin()[my_tail]; |
240 | } |
241 | //! similarly to front(), returns depth of the first range in the pool |
242 | depth_t front_depth() { |
243 | __TBB_ASSERT(my_size > 0, "range_vector::front_depth() with empty size" ); |
244 | return my_depth[my_tail]; |
245 | } |
246 | depth_t back_depth() { |
247 | __TBB_ASSERT(my_size > 0, "range_vector::back_depth() with empty size" ); |
248 | return my_depth[my_head]; |
249 | } |
250 | bool is_divisible(depth_t max_depth) { |
251 | return back_depth() < max_depth && back().is_divisible(); |
252 | } |
253 | }; |
254 | |
255 | //! Provides default methods for partition objects and common algorithm blocks. |
256 | template <typename Partition> |
257 | struct partition_type_base { |
258 | typedef detail::split split_type; |
259 | // decision makers |
260 | void note_affinity( slot_id ) {} |
261 | template <typename Task> |
262 | bool check_being_stolen(Task&, const execution_data&) { return false; } // part of old should_execute_range() |
263 | template <typename Range> split_type get_split() { return split(); } |
264 | Partition& self() { return *static_cast<Partition*>(this); } // CRTP helper |
265 | |
266 | template<typename StartType, typename Range> |
267 | void work_balance(StartType &start, Range &range, const execution_data&) { |
268 | start.run_body( range ); // static partitioner goes here |
269 | } |
270 | |
271 | template<typename StartType, typename Range> |
272 | void execute(StartType &start, Range &range, execution_data& ed) { |
273 | // The algorithm in a few words ([]-denotes calls to decision methods of partitioner): |
274 | // [If this task is stolen, adjust depth and divisions if necessary, set flag]. |
275 | // If range is divisible { |
276 | // Spread the work while [initial divisions left]; |
277 | // Create trap task [if necessary]; |
278 | // } |
279 | // If not divisible or [max depth is reached], execute, else do the range pool part |
280 | if ( range.is_divisible() ) { |
281 | if ( self().is_divisible() ) { |
                do { // split while divisible
283 | typename Partition::split_type split_obj = self().template get_split<Range>(); |
284 | start.offer_work( split_obj, ed ); |
285 | } while ( range.is_divisible() && self().is_divisible() ); |
286 | } |
287 | } |
288 | self().work_balance(start, range, ed); |
289 | } |
290 | }; |
291 | |
292 | //! Provides default splitting strategy for partition objects. |
293 | template <typename Partition> |
294 | struct adaptive_mode : partition_type_base<Partition> { |
295 | typedef Partition my_partition; |
296 | std::size_t my_divisor; |
297 | // For affinity_partitioner, my_divisor indicates the number of affinity array indices the task reserves. |
    // A task that has only one index must produce its right split without a reserved index, so that
    // the index is not overwritten by note_affinity() of the created (right) task.
300 | // I.e. a task created deeper than the affinity array can remember must not save its affinity (LIFO order) |
301 | static const unsigned factor = 1; |
302 | adaptive_mode() : my_divisor(get_initial_auto_partitioner_divisor() / 4 * my_partition::factor) {} |
303 | adaptive_mode(adaptive_mode &src, split) : my_divisor(do_split(src, split())) {} |
304 | adaptive_mode(adaptive_mode&, const proportional_split&) : my_divisor(0) |
305 | { |
306 | // left blank as my_divisor gets overridden in the successors' constructors |
307 | } |
308 | /*! Override do_split methods in order to specify splitting strategy */ |
309 | std::size_t do_split(adaptive_mode &src, split) { |
310 | return src.my_divisor /= 2u; |
311 | } |
312 | }; |
313 | |
314 | //! Helper type for checking availability of proportional_split constructor |
315 | template <typename T> using supports_proportional_splitting = typename std::is_constructible<T, T&, proportional_split&>; |
316 | |
317 | //! A helper class to create a proportional_split object for a given type of Range. |
318 | /** If the Range has proportional_split constructor, |
    then the created object splits a provided value in an implementation-defined proportion;
320 | otherwise it represents equal-size split. */ |
321 | // TODO: check if this helper can be a nested class of proportional_mode. |
322 | template <typename Range, typename = void> |
323 | struct proportion_helper { |
324 | static proportional_split get_split(std::size_t) { return proportional_split(1,1); } |
325 | }; |
326 | |
327 | template <typename Range> |
328 | struct proportion_helper<Range, typename std::enable_if<supports_proportional_splitting<Range>::value>::type> { |
329 | static proportional_split get_split(std::size_t n) { |
330 | std::size_t right = n / 2; |
331 | std::size_t left = n - right; |
332 | return proportional_split(left, right); |
333 | } |
334 | }; |
335 | |
336 | //! Provides proportional splitting strategy for partition objects |
337 | template <typename Partition> |
338 | struct proportional_mode : adaptive_mode<Partition> { |
339 | typedef Partition my_partition; |
340 | using partition_type_base<Partition>::self; // CRTP helper to get access to derived classes |
341 | |
342 | proportional_mode() : adaptive_mode<Partition>() {} |
343 | proportional_mode(proportional_mode &src, split) : adaptive_mode<Partition>(src, split()) {} |
344 | proportional_mode(proportional_mode &src, const proportional_split& split_obj) |
345 | : adaptive_mode<Partition>(src, split_obj) |
346 | { |
347 | self().my_divisor = do_split(src, split_obj); |
348 | } |
349 | std::size_t do_split(proportional_mode &src, const proportional_split& split_obj) { |
350 | std::size_t portion = split_obj.right() * my_partition::factor; |
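        // Round the portion to the nearest multiple of factor (a power of two): add factor/2, then mask off the low bits.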
351 | portion = (portion + my_partition::factor/2) & (0ul - my_partition::factor); |
352 | src.my_divisor -= portion; |
353 | return portion; |
354 | } |
355 | bool is_divisible() { // part of old should_execute_range() |
356 | return self().my_divisor > my_partition::factor; |
357 | } |
358 | template <typename Range> |
359 | proportional_split get_split() { |
360 | // Create a proportion for the number of threads expected to handle "this" subrange |
361 | return proportion_helper<Range>::get_split( self().my_divisor / my_partition::factor ); |
362 | } |
363 | }; |
364 | |
365 | static std::size_t get_initial_partition_head() { |
366 | int current_index = tbb::this_task_arena::current_thread_index(); |
367 | if (current_index == tbb::task_arena::not_initialized) |
368 | current_index = 0; |
369 | return size_t(current_index); |
370 | } |
371 | |
372 | //! Provides default linear indexing of partitioner's sequence |
373 | template <typename Partition> |
374 | struct linear_affinity_mode : proportional_mode<Partition> { |
375 | std::size_t my_head; |
376 | std::size_t my_max_affinity; |
377 | using proportional_mode<Partition>::self; |
378 | linear_affinity_mode() : proportional_mode<Partition>(), my_head(get_initial_partition_head()), |
379 | my_max_affinity(self().my_divisor) {} |
380 | linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode<Partition>(src, split()) |
381 | , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} |
382 | linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode<Partition>(src, split_obj) |
383 | , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} |
384 | void spawn_task(task& t, task_group_context& ctx) { |
385 | if (self().my_divisor) { |
            spawn(t, ctx, slot_id(my_head));
387 | } else { |
388 | spawn(t, ctx); |
389 | } |
390 | } |
391 | }; |
392 | |
393 | static bool is_stolen_task(const execution_data& ed) { |
394 | return execution_slot(ed) != original_slot(ed); |
395 | } |
396 | |
397 | /*! Determine work-balance phase implementing splitting & stealing actions */ |
398 | template<class Mode> |
399 | struct dynamic_grainsize_mode : Mode { |
400 | using Mode::self; |
401 | enum { |
402 | begin = 0, |
403 | run, |
404 | pass |
405 | } my_delay; |
406 | depth_t my_max_depth; |
407 | static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY; |
408 | dynamic_grainsize_mode(): Mode() |
409 | , my_delay(begin) |
410 | , my_max_depth(__TBB_INIT_DEPTH) {} |
411 | dynamic_grainsize_mode(dynamic_grainsize_mode& p, split) |
412 | : Mode(p, split()) |
413 | , my_delay(pass) |
414 | , my_max_depth(p.my_max_depth) {} |
415 | dynamic_grainsize_mode(dynamic_grainsize_mode& p, const proportional_split& split_obj) |
416 | : Mode(p, split_obj) |
417 | , my_delay(begin) |
418 | , my_max_depth(p.my_max_depth) {} |
419 | template <typename Task> |
420 | bool check_being_stolen(Task &t, const execution_data& ed) { // part of old should_execute_range() |
421 | if( !(self().my_divisor / Mode::my_partition::factor) ) { // if not from the top P tasks of binary tree |
422 | self().my_divisor = 1; // TODO: replace by on-stack flag (partition_state's member)? |
423 | if( is_stolen_task(ed) && t.my_parent->m_ref_count >= 2 ) { // runs concurrently with the left task |
424 | #if __TBB_USE_OPTIONAL_RTTI |
425 | // RTTI is available, check whether the cast is valid |
426 | // TODO: TBB_REVAMP_TODO __TBB_ASSERT(dynamic_cast<tree_node*>(t.m_parent), 0); |
427 | // correctness of the cast relies on avoiding the root task for which: |
428 | // - initial value of my_divisor != 0 (protected by separate assertion) |
429 | // - is_stolen_task() always returns false for the root task. |
430 | #endif |
431 | tree_node::mark_task_stolen(t); |
432 | if( !my_max_depth ) my_max_depth++; |
433 | my_max_depth += __TBB_DEMAND_DEPTH_ADD; |
434 | return true; |
435 | } |
436 | } |
437 | return false; |
438 | } |
439 | depth_t max_depth() { return my_max_depth; } |
440 | void align_depth(depth_t base) { |
441 | __TBB_ASSERT(base <= my_max_depth, 0); |
442 | my_max_depth -= base; |
443 | } |
444 | template<typename StartType, typename Range> |
445 | void work_balance(StartType &start, Range &range, execution_data& ed) { |
446 | if( !range.is_divisible() || !self().max_depth() ) { |
447 | start.run_body( range ); |
448 | } |
449 | else { // do range pool |
450 | range_vector<Range, range_pool_size> range_pool(range); |
451 | do { |
452 | range_pool.split_to_fill(self().max_depth()); // fill range pool |
453 | if( self().check_for_demand( start ) ) { |
454 | if( range_pool.size() > 1 ) { |
455 | start.offer_work( range_pool.front(), range_pool.front_depth(), ed ); |
456 | range_pool.pop_front(); |
457 | continue; |
458 | } |
459 | if( range_pool.is_divisible(self().max_depth()) ) // was not enough depth to fork a task |
460 | continue; // note: next split_to_fill() should split range at least once |
461 | } |
462 | start.run_body( range_pool.back() ); |
463 | range_pool.pop_back(); |
464 | } while( !range_pool.empty() && !ed.context->is_group_execution_cancelled() ); |
465 | } |
466 | } |
467 | template <typename Task> |
468 | bool check_for_demand(Task& t) { |
469 | if ( pass == my_delay ) { |
470 | if ( self().my_divisor > 1 ) // produce affinitized tasks while they have slot in array |
471 | return true; // do not do my_max_depth++ here, but be sure range_pool is splittable once more |
472 | else if ( self().my_divisor && my_max_depth ) { // make balancing task |
473 | self().my_divisor = 0; // once for each task; depth will be decreased in align_depth() |
474 | return true; |
475 | } |
476 | else if ( tree_node::is_peer_stolen(t) ) { |
477 | my_max_depth += __TBB_DEMAND_DEPTH_ADD; |
478 | return true; |
479 | } |
480 | } else if( begin == my_delay ) { |
481 | my_delay = pass; |
482 | } |
483 | return false; |
484 | } |
485 | }; |
486 | |
487 | class auto_partition_type: public dynamic_grainsize_mode<adaptive_mode<auto_partition_type> > { |
488 | public: |
489 | auto_partition_type( const auto_partitioner& ) |
490 | : dynamic_grainsize_mode<adaptive_mode<auto_partition_type> >() { |
491 | my_divisor *= __TBB_INITIAL_CHUNKS; |
492 | } |
493 | auto_partition_type( auto_partition_type& src, split) |
494 | : dynamic_grainsize_mode<adaptive_mode<auto_partition_type> >(src, split()) {} |
495 | bool is_divisible() { // part of old should_execute_range() |
496 | if( my_divisor > 1 ) return true; |
497 | if( my_divisor && my_max_depth ) { // can split the task. TODO: on-stack flag instead |
498 | // keep same fragmentation while splitting for the local task pool |
499 | my_max_depth--; |
500 | my_divisor = 0; // decrease max_depth once per task |
501 | return true; |
502 | } else return false; |
503 | } |
504 | template <typename Task> |
505 | bool check_for_demand(Task& t) { |
506 | if (tree_node::is_peer_stolen(t)) { |
507 | my_max_depth += __TBB_DEMAND_DEPTH_ADD; |
508 | return true; |
509 | } else return false; |
510 | } |
511 | void spawn_task(task& t, task_group_context& ctx) { |
512 | spawn(t, ctx); |
513 | } |
514 | }; |
515 | |
516 | class simple_partition_type: public partition_type_base<simple_partition_type> { |
517 | public: |
518 | simple_partition_type( const simple_partitioner& ) {} |
519 | simple_partition_type( const simple_partition_type&, split ) {} |
520 | //! simplified algorithm |
521 | template<typename StartType, typename Range> |
522 | void execute(StartType &start, Range &range, execution_data& ed) { |
523 | split_type split_obj = split(); // start.offer_work accepts split_type as reference |
524 | while( range.is_divisible() ) |
525 | start.offer_work( split_obj, ed ); |
526 | start.run_body( range ); |
527 | } |
528 | void spawn_task(task& t, task_group_context& ctx) { |
529 | spawn(t, ctx); |
530 | } |
531 | }; |
532 | |
533 | class static_partition_type : public linear_affinity_mode<static_partition_type> { |
534 | public: |
535 | typedef detail::proportional_split split_type; |
536 | static_partition_type( const static_partitioner& ) |
537 | : linear_affinity_mode<static_partition_type>() {} |
538 | static_partition_type( static_partition_type& p, const proportional_split& split_obj ) |
539 | : linear_affinity_mode<static_partition_type>(p, split_obj) {} |
540 | }; |
541 | |
542 | class affinity_partition_type : public dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> > { |
543 | static const unsigned factor_power = 4; // TODO: get a unified formula based on number of computing units |
544 | slot_id* my_array; |
545 | public: |
546 | static const unsigned factor = 1 << factor_power; // number of slots in affinity array per task |
547 | typedef detail::proportional_split split_type; |
548 | affinity_partition_type( affinity_partitioner_base& ap ) |
549 | : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >() { |
550 | __TBB_ASSERT( (factor&(factor-1))==0, "factor must be power of two" ); |
551 | ap.resize(factor); |
552 | my_array = ap.my_array; |
553 | my_max_depth = factor_power + 1; |
554 | __TBB_ASSERT( my_max_depth < __TBB_RANGE_POOL_CAPACITY, 0 ); |
555 | } |
556 | affinity_partition_type(affinity_partition_type& p, split) |
557 | : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >(p, split()) |
558 | , my_array(p.my_array) {} |
559 | affinity_partition_type(affinity_partition_type& p, const proportional_split& split_obj) |
560 | : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >(p, split_obj) |
561 | , my_array(p.my_array) {} |
562 | void note_affinity(slot_id id) { |
563 | if( my_divisor ) |
564 | my_array[my_head] = id; |
565 | } |
566 | void spawn_task(task& t, task_group_context& ctx) { |
567 | if (my_divisor) { |
568 | if (!my_array[my_head]) { |
569 | // TODO: consider new ideas with my_array for both affinity and static partitioner's, then code reuse |
                spawn(t, ctx, slot_id(my_head / factor));
            } else {
                spawn(t, ctx, my_array[my_head]);
573 | } |
574 | } else { |
575 | spawn(t, ctx); |
576 | } |
577 | } |
578 | }; |
579 | |
580 | //! A simple partitioner |
/** Divides the range until it is no longer divisible.
582 | @ingroup algorithms */ |
583 | class simple_partitioner { |
584 | public: |
585 | simple_partitioner() {} |
586 | private: |
587 | template<typename Range, typename Body, typename Partitioner> friend struct start_for; |
588 | template<typename Range, typename Body, typename Partitioner> friend struct start_reduce; |
589 | template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce; |
590 | template<typename Range, typename Body, typename Partitioner> friend struct start_scan; |
591 | // new implementation just extends existing interface |
592 | typedef simple_partition_type task_partition_type; |
593 | // TODO: consider to make split_type public |
594 | typedef simple_partition_type::split_type split_type; |
595 | |
596 | // for parallel_scan only |
597 | class partition_type { |
598 | public: |
599 | bool should_execute_range(const execution_data& ) {return false;} |
600 | partition_type( const simple_partitioner& ) {} |
601 | partition_type( const partition_type&, split ) {} |
602 | }; |
603 | }; |
604 | |
605 | //! An auto partitioner |
/** The range is initially divided into several large chunks.
    Chunks are further subdivided into smaller pieces if demand is detected and they are divisible.
608 | @ingroup algorithms */ |
609 | class auto_partitioner { |
610 | public: |
611 | auto_partitioner() {} |
612 | |
613 | private: |
614 | template<typename Range, typename Body, typename Partitioner> friend struct start_for; |
615 | template<typename Range, typename Body, typename Partitioner> friend struct start_reduce; |
616 | template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce; |
617 | template<typename Range, typename Body, typename Partitioner> friend struct start_scan; |
618 | // new implementation just extends existing interface |
619 | typedef auto_partition_type task_partition_type; |
620 | // TODO: consider to make split_type public |
621 | typedef auto_partition_type::split_type split_type; |
622 | |
623 | //! Backward-compatible partition for auto and affinity partition objects. |
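    // This legacy interface is used by parallel_scan: a range is executed without further splitting
    // once its chunk budget drops to 1, and a freshly stolen task raises the budget back to
    // VICTIM_CHUNKS so that the thief splits a few more times before running the body.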
624 | class partition_type { |
625 | size_t num_chunks; |
626 | static const size_t VICTIM_CHUNKS = 4; |
627 | public: |
628 | bool should_execute_range(const execution_data& ed) { |
629 | if( num_chunks<VICTIM_CHUNKS && is_stolen_task(ed) ) |
630 | num_chunks = VICTIM_CHUNKS; |
631 | return num_chunks==1; |
632 | } |
633 | partition_type( const auto_partitioner& ) |
634 | : num_chunks(get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4) {} |
635 | partition_type( partition_type& pt, split ) { |
636 | num_chunks = pt.num_chunks = (pt.num_chunks+1u) / 2u; |
637 | } |
638 | }; |
639 | }; |
640 | |
641 | //! A static partitioner |
642 | class static_partitioner { |
643 | public: |
644 | static_partitioner() {} |
645 | private: |
646 | template<typename Range, typename Body, typename Partitioner> friend struct start_for; |
647 | template<typename Range, typename Body, typename Partitioner> friend struct start_reduce; |
648 | template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce; |
649 | template<typename Range, typename Body, typename Partitioner> friend struct start_scan; |
650 | // new implementation just extends existing interface |
651 | typedef static_partition_type task_partition_type; |
652 | // TODO: consider to make split_type public |
653 | typedef static_partition_type::split_type split_type; |
654 | }; |
655 | |
656 | //! An affinity partitioner |
657 | class affinity_partitioner : affinity_partitioner_base { |
658 | public: |
659 | affinity_partitioner() {} |
660 | |
661 | private: |
662 | template<typename Range, typename Body, typename Partitioner> friend struct start_for; |
663 | template<typename Range, typename Body, typename Partitioner> friend struct start_reduce; |
664 | template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce; |
665 | template<typename Range, typename Body, typename Partitioner> friend struct start_scan; |
666 | // new implementation just extends existing interface |
667 | typedef affinity_partition_type task_partition_type; |
668 | // TODO: consider to make split_type public |
669 | typedef affinity_partition_type::split_type split_type; |
670 | }; |
671 | |
672 | } // namespace d1 |
673 | } // namespace detail |
674 | |
675 | inline namespace v1 { |
676 | // Partitioners |
677 | using detail::d1::auto_partitioner; |
678 | using detail::d1::simple_partitioner; |
679 | using detail::d1::static_partitioner; |
680 | using detail::d1::affinity_partitioner; |
681 | // Split types |
682 | using detail::split; |
683 | using detail::proportional_split; |
684 | } // namespace v1 |
685 | |
686 | } // namespace tbb |
687 | |
688 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) |
689 | #pragma warning (pop) |
690 | #endif // warning 4244 is back |
691 | |
692 | #undef __TBB_INITIAL_CHUNKS |
693 | #undef __TBB_RANGE_POOL_CAPACITY |
694 | #undef __TBB_INIT_DEPTH |
695 | |
696 | #endif /* __TBB_partitioner_H */ |
697 | |