/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_partitioner_H
#define __TBB_partitioner_H

#ifndef __TBB_INITIAL_CHUNKS
// initial task divisions per thread
#define __TBB_INITIAL_CHUNKS 2
#endif
#ifndef __TBB_RANGE_POOL_CAPACITY
// maximum number of elements in range pool
#define __TBB_RANGE_POOL_CAPACITY 8
#endif
#ifndef __TBB_INIT_DEPTH
// initial value for depth of range pool
#define __TBB_INIT_DEPTH 5
#endif
#ifndef __TBB_DEMAND_DEPTH_ADD
// when imbalance is detected, the range is allowed to split this many additional times
#define __TBB_DEMAND_DEPTH_ADD 1
#endif

#include "detail/_config.h"
#include "detail/_namespace_injection.h"
#include "detail/_aligned_space.h"
#include "detail/_utils.h"
#include "detail/_template_helpers.h"
#include "detail/_range_common.h"
#include "detail/_task.h"
#include "detail/_small_object_pool.h"

#include "cache_aligned_allocator.h"
#include "task_group.h" // task_group_context
#include "task_arena.h"

#include <algorithm>
#include <atomic>
#include <type_traits>

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
    // Workaround for overzealous compiler warnings
    #pragma warning (push)
    #pragma warning (disable: 4244)
#endif

namespace tbb {
namespace detail {

namespace d1 {
class auto_partitioner;
class simple_partitioner;
class static_partitioner;
class affinity_partitioner;
class affinity_partition_type;
class affinity_partitioner_base;

inline std::size_t get_initial_auto_partitioner_divisor() {
    const std::size_t factor = 4;
    return factor * max_concurrency();
}
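
// Illustrative arithmetic (not a guarantee): with max_concurrency() == 8 the divisor above
// is 4 * 8 == 32. adaptive_mode below divides this by 4 (times its 'factor'), and
// auto_partition_type multiplies by __TBB_INITIAL_CHUNKS, which yields the documented
// __TBB_INITIAL_CHUNKS initial task divisions per thread.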

//! Defines entry point for affinity partitioner into oneTBB run-time library.
class affinity_partitioner_base: no_copy {
    friend class affinity_partitioner;
    friend class affinity_partition_type;
    //! Array that remembers affinities of tree positions to affinity_id.
    /** NULL if my_size==0. */
    slot_id* my_array;
    //! Number of elements in my_array.
    std::size_t my_size;
    //! Zeros the fields.
    affinity_partitioner_base() : my_array(nullptr), my_size(0) {}
    //! Deallocates my_array.
    ~affinity_partitioner_base() { resize(0); }
    //! Resize my_array.
    /** Retains values if resulting size is the same. */
    void resize(unsigned factor) {
        // Check factor to avoid asking for number of workers while there might be no arena.
        unsigned max_threads_in_arena = max_concurrency();
        std::size_t new_size = factor ? factor * max_threads_in_arena : 0;
        if (new_size != my_size) {
            if (my_array) {
                r1::cache_aligned_deallocate(my_array);
                // Following two assignments must be done here for sake of exception safety.
                my_array = nullptr;
                my_size = 0;
            }
            if (new_size) {
                my_array = static_cast<slot_id*>(r1::cache_aligned_allocate(new_size * sizeof(slot_id)));
                std::fill_n(my_array, new_size, no_slot);
                my_size = new_size;
            }
        }
    }
};
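
// Sizing sketch (illustrative numbers): affinity_partition_type below calls resize(factor)
// with factor == 16, so in an arena with max_concurrency() == 8 the affinity array holds
// 16 * 8 == 128 slot_id entries, all initialized to no_slot.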

template<typename Range, typename Body, typename Partitioner> struct start_for;
template<typename Range, typename Body, typename Partitioner> struct start_scan;
template<typename Range, typename Body, typename Partitioner> struct start_reduce;
template<typename Range, typename Body, typename Partitioner> struct start_deterministic_reduce;

struct node {
    node* my_parent{};
    std::atomic<int> m_ref_count{};

    node() = default;
    node(node* parent, int ref_count) :
        my_parent{parent}, m_ref_count{ref_count} {
        __TBB_ASSERT(ref_count > 0, "The ref count must be positive");
    }
};

struct wait_node : node {
    wait_node() : node{ nullptr, 1 } {}
    wait_context m_wait{1};
};

//! Join task node that contains shared flag for stealing feedback
struct tree_node : public node {
    small_object_allocator m_allocator;
    std::atomic<bool> m_child_stolen{false};

    tree_node(node* parent, int ref_count, small_object_allocator& alloc)
        : node{parent, ref_count}
        , m_allocator{alloc} {}

    void join(task_group_context*) {/*dummy, required only for reduction algorithms*/}

    template <typename Task>
    static void mark_task_stolen(Task &t) {
        std::atomic<bool> &flag = static_cast<tree_node*>(t.my_parent)->m_child_stolen;
#if TBB_USE_PROFILING_TOOLS
        // Threading tools respect lock prefix but report false-positive data-race via plain store
        flag.exchange(true);
#else
        flag.store(true, std::memory_order_relaxed);
#endif // TBB_USE_PROFILING_TOOLS
    }
    template <typename Task>
    static bool is_peer_stolen(Task &t) {
        return static_cast<tree_node*>(t.my_parent)->m_child_stolen.load(std::memory_order_relaxed);
    }
};

// Context used to check cancellation state during reduction join process
template<typename TreeNodeType>
void fold_tree(node* n, const execution_data& ed) {
    for (;;) {
        __TBB_ASSERT(n->m_ref_count.load(std::memory_order_relaxed) > 0, "The refcount must be positive.");
        call_itt_task_notify(releasing, n);
        if (--n->m_ref_count > 0) {
            return;
        }
        node* parent = n->my_parent;
        if (!parent) {
            break;
        }

        call_itt_task_notify(acquired, n);
        TreeNodeType* self = static_cast<TreeNodeType*>(n);
        self->join(ed.context);
        self->m_allocator.delete_object(self, ed);
        n = parent;
    }
    // Finish parallel for execution when the root (last node) is reached
    static_cast<wait_node*>(n)->m_wait.release();
}

//! Depth is a relative depth of recursive division inside a range pool. Relative depth allows
//! effectively unbounded absolute recursion depth for heavily unbalanced workloads with a range
//! represented by a number that cannot fit into a machine word.
typedef unsigned char depth_t;

//! Range pool stores ranges of type T in a circular buffer with MaxCapacity
template <typename T, depth_t MaxCapacity>
class range_vector {
    depth_t my_head;
    depth_t my_tail;
    depth_t my_size;
    depth_t my_depth[MaxCapacity]; // relative depths of stored ranges
    tbb::detail::aligned_space<T, MaxCapacity> my_pool;

public:
    //! initialize via first range in pool
    range_vector(const T& elem) : my_head(0), my_tail(0), my_size(1) {
        my_depth[0] = 0;
        new( static_cast<void *>(my_pool.begin()) ) T(elem); //TODO: std::move?
    }
    ~range_vector() {
        while( !empty() ) pop_back();
    }
    bool empty() const { return my_size == 0; }
    depth_t size() const { return my_size; }
    //! Populates range pool via ranges up to max depth or while divisible
    //! max_depth starts from 0, e.g. a value of 2 makes up to 3 ranges in the pool, up to two of them 1/4 pieces
    void split_to_fill(depth_t max_depth) {
        while( my_size < MaxCapacity && is_divisible(max_depth) ) {
            depth_t prev = my_head;
            my_head = (my_head + 1) % MaxCapacity;
            new(my_pool.begin()+my_head) T(my_pool.begin()[prev]); // copy TODO: std::move?
            my_pool.begin()[prev].~T(); // instead of assignment
            new(my_pool.begin()+prev) T(my_pool.begin()[my_head], detail::split()); // do 'inverse' split
            my_depth[my_head] = ++my_depth[prev];
            my_size++;
        }
    }
    void pop_back() {
        __TBB_ASSERT(my_size > 0, "range_vector::pop_back() with empty size");
        my_pool.begin()[my_head].~T();
        my_size--;
        my_head = (my_head + MaxCapacity - 1) % MaxCapacity;
    }
    void pop_front() {
        __TBB_ASSERT(my_size > 0, "range_vector::pop_front() with empty size");
        my_pool.begin()[my_tail].~T();
        my_size--;
        my_tail = (my_tail + 1) % MaxCapacity;
    }
    T& back() {
        __TBB_ASSERT(my_size > 0, "range_vector::back() with empty size");
        return my_pool.begin()[my_head];
    }
    T& front() {
        __TBB_ASSERT(my_size > 0, "range_vector::front() with empty size");
        return my_pool.begin()[my_tail];
    }
    //! similarly to front(), returns depth of the first range in the pool
    depth_t front_depth() {
        __TBB_ASSERT(my_size > 0, "range_vector::front_depth() with empty size");
        return my_depth[my_tail];
    }
    depth_t back_depth() {
        __TBB_ASSERT(my_size > 0, "range_vector::back_depth() with empty size");
        return my_depth[my_head];
    }
    bool is_divisible(depth_t max_depth) {
        return back_depth() < max_depth && back().is_divisible();
    }
};
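
// Internal usage sketch (mirrors dynamic_grainsize_mode::work_balance below; shown only for
// illustration, not as a public API):
//
//     range_vector<Range, __TBB_RANGE_POOL_CAPACITY> pool(range);   // seed with the whole range
//     pool.split_to_fill(max_depth);                 // split back() until capacity/depth is reached
//     start.offer_work(pool.front(), pool.front_depth(), ed);  // offer the shallowest (largest) piece
//     pool.pop_front();
//     start.run_body(pool.back());                   // run the deepest (smallest) piece locally
//     pool.pop_back();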

//! Provides default methods for partition objects and common algorithm blocks.
template <typename Partition>
struct partition_type_base {
    typedef detail::split split_type;
    // decision makers
    void note_affinity( slot_id ) {}
    template <typename Task>
    bool check_being_stolen(Task&, const execution_data&) { return false; } // part of old should_execute_range()
    template <typename Range> split_type get_split() { return split(); }
    Partition& self() { return *static_cast<Partition*>(this); } // CRTP helper

    template<typename StartType, typename Range>
    void work_balance(StartType &start, Range &range, const execution_data&) {
        start.run_body( range ); // static partitioner goes here
    }

    template<typename StartType, typename Range>
    void execute(StartType &start, Range &range, execution_data& ed) {
        // The algorithm in a few words ([]-denotes calls to decision methods of partitioner):
        // [If this task is stolen, adjust depth and divisions if necessary, set flag].
        // If range is divisible {
        //    Spread the work while [initial divisions left];
        //    Create trap task [if necessary];
        // }
        // If not divisible or [max depth is reached], execute, else do the range pool part
        if ( range.is_divisible() ) {
            if ( self().is_divisible() ) {
                do { // split while the range and the partition are divisible
                    typename Partition::split_type split_obj = self().template get_split<Range>();
                    start.offer_work( split_obj, ed );
                } while ( range.is_divisible() && self().is_divisible() );
            }
        }
        self().work_balance(start, range, ed);
    }
};

//! Provides default splitting strategy for partition objects.
template <typename Partition>
struct adaptive_mode : partition_type_base<Partition> {
    typedef Partition my_partition;
    std::size_t my_divisor;
    // For affinity_partitioner, my_divisor indicates the number of affinity array indices the task reserves.
    // A task that has only one index must produce the right split without a reserved index, so that the
    // index is not overwritten in note_affinity() of the created (right) task.
    // I.e. a task created deeper than the affinity array can remember must not save its affinity (LIFO order).
    static const unsigned factor = 1;
    adaptive_mode() : my_divisor(get_initial_auto_partitioner_divisor() / 4 * my_partition::factor) {}
    adaptive_mode(adaptive_mode &src, split) : my_divisor(do_split(src, split())) {}
    adaptive_mode(adaptive_mode&, const proportional_split&) : my_divisor(0)
    {
        // left blank as my_divisor gets overridden in the successors' constructors
    }
    /*! Override do_split methods in order to specify splitting strategy */
    std::size_t do_split(adaptive_mode &src, split) {
        return src.my_divisor /= 2u;
    }
};

//! Helper type for checking availability of proportional_split constructor
template <typename T> using supports_proportional_splitting = typename std::is_constructible<T, T&, proportional_split&>;

//! A helper class to create a proportional_split object for a given type of Range.
/** If the Range has proportional_split constructor,
    then created object splits a provided value in an implementation-defined proportion;
    otherwise it represents equal-size split. */
// TODO: check if this helper can be a nested class of proportional_mode.
template <typename Range, typename = void>
struct proportion_helper {
    static proportional_split get_split(std::size_t) { return proportional_split(1,1); }
};

template <typename Range>
struct proportion_helper<Range, typename std::enable_if<supports_proportional_splitting<Range>::value>::type> {
    static proportional_split get_split(std::size_t n) {
        std::size_t right = n / 2;
        std::size_t left = n - right;
        return proportional_split(left, right);
    }
};

//! Provides proportional splitting strategy for partition objects
template <typename Partition>
struct proportional_mode : adaptive_mode<Partition> {
    typedef Partition my_partition;
    using partition_type_base<Partition>::self; // CRTP helper to get access to derived classes

    proportional_mode() : adaptive_mode<Partition>() {}
    proportional_mode(proportional_mode &src, split) : adaptive_mode<Partition>(src, split()) {}
    proportional_mode(proportional_mode &src, const proportional_split& split_obj)
        : adaptive_mode<Partition>(src, split_obj)
    {
        self().my_divisor = do_split(src, split_obj);
    }
    std::size_t do_split(proportional_mode &src, const proportional_split& split_obj) {
        std::size_t portion = split_obj.right() * my_partition::factor;
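        // Illustrative note: the next line rounds 'portion' to the nearest multiple of
        // my_partition::factor (a power of two); e.g. with factor == 16, 23 -> 16 and 25 -> 32,
        // while with factor == 1 the value is left unchanged.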
        portion = (portion + my_partition::factor/2) & (0ul - my_partition::factor);
        src.my_divisor -= portion;
        return portion;
    }
    bool is_divisible() { // part of old should_execute_range()
        return self().my_divisor > my_partition::factor;
    }
    template <typename Range>
    proportional_split get_split() {
        // Create a proportion for the number of threads expected to handle "this" subrange
        return proportion_helper<Range>::get_split( self().my_divisor / my_partition::factor );
    }
};

static std::size_t get_initial_partition_head() {
    int current_index = tbb::this_task_arena::current_thread_index();
    if (current_index == tbb::task_arena::not_initialized)
        current_index = 0;
    return size_t(current_index);
}

//! Provides default linear indexing of partitioner's sequence
template <typename Partition>
struct linear_affinity_mode : proportional_mode<Partition> {
    std::size_t my_head;
    std::size_t my_max_affinity;
    using proportional_mode<Partition>::self;
    linear_affinity_mode() : proportional_mode<Partition>(), my_head(get_initial_partition_head()),
        my_max_affinity(self().my_divisor) {}
    linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode<Partition>(src, split())
        , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {}
    linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode<Partition>(src, split_obj)
        , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {}
    void spawn_task(task& t, task_group_context& ctx) {
        if (self().my_divisor) {
            spawn(t, ctx, slot_id(my_head));
        } else {
            spawn(t, ctx);
        }
    }
};

static bool is_stolen_task(const execution_data& ed) {
    return execution_slot(ed) != original_slot(ed);
}

/*! Determine work-balance phase implementing splitting & stealing actions */
template<class Mode>
struct dynamic_grainsize_mode : Mode {
    using Mode::self;
    enum {
        begin = 0,
        run,
        pass
    } my_delay;
    depth_t my_max_depth;
    static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY;
    dynamic_grainsize_mode(): Mode()
        , my_delay(begin)
        , my_max_depth(__TBB_INIT_DEPTH) {}
    dynamic_grainsize_mode(dynamic_grainsize_mode& p, split)
        : Mode(p, split())
        , my_delay(pass)
        , my_max_depth(p.my_max_depth) {}
    dynamic_grainsize_mode(dynamic_grainsize_mode& p, const proportional_split& split_obj)
        : Mode(p, split_obj)
        , my_delay(begin)
        , my_max_depth(p.my_max_depth) {}
    template <typename Task>
    bool check_being_stolen(Task &t, const execution_data& ed) { // part of old should_execute_range()
        if( !(self().my_divisor / Mode::my_partition::factor) ) { // if not from the top P tasks of binary tree
            self().my_divisor = 1; // TODO: replace by on-stack flag (partition_state's member)?
            if( is_stolen_task(ed) && t.my_parent->m_ref_count >= 2 ) { // runs concurrently with the left task
#if __TBB_USE_OPTIONAL_RTTI
                // RTTI is available, check whether the cast is valid
                // TODO: TBB_REVAMP_TODO __TBB_ASSERT(dynamic_cast<tree_node*>(t.m_parent), 0);
                // correctness of the cast relies on avoiding the root task for which:
                // - initial value of my_divisor != 0 (protected by separate assertion)
                // - is_stolen_task() always returns false for the root task.
#endif
                tree_node::mark_task_stolen(t);
                if( !my_max_depth ) my_max_depth++;
                my_max_depth += __TBB_DEMAND_DEPTH_ADD;
                return true;
            }
        }
        return false;
    }
    depth_t max_depth() { return my_max_depth; }
    void align_depth(depth_t base) {
        __TBB_ASSERT(base <= my_max_depth, 0);
        my_max_depth -= base;
    }
    template<typename StartType, typename Range>
    void work_balance(StartType &start, Range &range, execution_data& ed) {
        if( !range.is_divisible() || !self().max_depth() ) {
            start.run_body( range );
        }
        else { // do range pool
            range_vector<Range, range_pool_size> range_pool(range);
            do {
                range_pool.split_to_fill(self().max_depth()); // fill range pool
                if( self().check_for_demand( start ) ) {
                    if( range_pool.size() > 1 ) {
                        start.offer_work( range_pool.front(), range_pool.front_depth(), ed );
                        range_pool.pop_front();
                        continue;
                    }
                    if( range_pool.is_divisible(self().max_depth()) ) // was not enough depth to fork a task
                        continue; // note: next split_to_fill() should split range at least once
                }
                start.run_body( range_pool.back() );
                range_pool.pop_back();
            } while( !range_pool.empty() && !ed.context->is_group_execution_cancelled() );
        }
    }
    template <typename Task>
    bool check_for_demand(Task& t) {
        if ( pass == my_delay ) {
            if ( self().my_divisor > 1 ) // produce affinitized tasks while they have slot in array
                return true; // do not do my_max_depth++ here, but be sure range_pool is splittable once more
            else if ( self().my_divisor && my_max_depth ) { // make balancing task
                self().my_divisor = 0; // once for each task; depth will be decreased in align_depth()
                return true;
            }
            else if ( tree_node::is_peer_stolen(t) ) {
                my_max_depth += __TBB_DEMAND_DEPTH_ADD;
                return true;
            }
        } else if( begin == my_delay ) {
            my_delay = pass;
        }
        return false;
    }
};

class auto_partition_type: public dynamic_grainsize_mode<adaptive_mode<auto_partition_type> > {
public:
    auto_partition_type( const auto_partitioner& )
        : dynamic_grainsize_mode<adaptive_mode<auto_partition_type> >() {
        my_divisor *= __TBB_INITIAL_CHUNKS;
    }
    auto_partition_type( auto_partition_type& src, split)
        : dynamic_grainsize_mode<adaptive_mode<auto_partition_type> >(src, split()) {}
    bool is_divisible() { // part of old should_execute_range()
        if( my_divisor > 1 ) return true;
        if( my_divisor && my_max_depth ) { // can split the task. TODO: on-stack flag instead
            // keep same fragmentation while splitting for the local task pool
            my_max_depth--;
            my_divisor = 0; // decrease max_depth once per task
            return true;
        } else return false;
    }
    template <typename Task>
    bool check_for_demand(Task& t) {
        if (tree_node::is_peer_stolen(t)) {
            my_max_depth += __TBB_DEMAND_DEPTH_ADD;
            return true;
        } else return false;
    }
    void spawn_task(task& t, task_group_context& ctx) {
        spawn(t, ctx);
    }
};

class simple_partition_type: public partition_type_base<simple_partition_type> {
public:
    simple_partition_type( const simple_partitioner& ) {}
    simple_partition_type( const simple_partition_type&, split ) {}
    //! simplified algorithm
    template<typename StartType, typename Range>
    void execute(StartType &start, Range &range, execution_data& ed) {
        split_type split_obj = split(); // start.offer_work accepts split_type as reference
        while( range.is_divisible() )
            start.offer_work( split_obj, ed );
        start.run_body( range );
    }
    void spawn_task(task& t, task_group_context& ctx) {
        spawn(t, ctx);
    }
};

class static_partition_type : public linear_affinity_mode<static_partition_type> {
public:
    typedef detail::proportional_split split_type;
    static_partition_type( const static_partitioner& )
        : linear_affinity_mode<static_partition_type>() {}
    static_partition_type( static_partition_type& p, const proportional_split& split_obj )
        : linear_affinity_mode<static_partition_type>(p, split_obj) {}
};

class affinity_partition_type : public dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> > {
    static const unsigned factor_power = 4; // TODO: get a unified formula based on number of computing units
    slot_id* my_array;
public:
    static const unsigned factor = 1 << factor_power; // number of slots in affinity array per task
    typedef detail::proportional_split split_type;
    affinity_partition_type( affinity_partitioner_base& ap )
        : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >() {
        __TBB_ASSERT( (factor&(factor-1))==0, "factor must be power of two" );
        ap.resize(factor);
        my_array = ap.my_array;
        my_max_depth = factor_power + 1;
        __TBB_ASSERT( my_max_depth < __TBB_RANGE_POOL_CAPACITY, 0 );
    }
    affinity_partition_type(affinity_partition_type& p, split)
        : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >(p, split())
        , my_array(p.my_array) {}
    affinity_partition_type(affinity_partition_type& p, const proportional_split& split_obj)
        : dynamic_grainsize_mode<linear_affinity_mode<affinity_partition_type> >(p, split_obj)
        , my_array(p.my_array) {}
    void note_affinity(slot_id id) {
        if( my_divisor )
            my_array[my_head] = id;
    }
    void spawn_task(task& t, task_group_context& ctx) {
        if (my_divisor) {
            if (!my_array[my_head]) {
                // TODO: consider new ideas with my_array for both affinity and static partitioner's, then code reuse
                spawn(t, ctx, slot_id(my_head / factor));
            } else {
                spawn(t, ctx, my_array[my_head]);
            }
        } else {
            spawn(t, ctx);
        }
    }
};

//! A simple partitioner
/** Divides the range until the range is not divisible.
    @ingroup algorithms */
class simple_partitioner {
public:
    simple_partitioner() {}
private:
    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
    template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
    // new implementation just extends existing interface
    typedef simple_partition_type task_partition_type;
    // TODO: consider to make split_type public
    typedef simple_partition_type::split_type split_type;

    // for parallel_scan only
    class partition_type {
    public:
        bool should_execute_range(const execution_data& ) {return false;}
        partition_type( const simple_partitioner& ) {}
        partition_type( const partition_type&, split ) {}
    };
};
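
// Usage sketch for simple_partitioner (illustrative; assumes tbb/parallel_for.h and
// tbb/blocked_range.h are included and that n and process() are supplied by the caller).
// The range is split all the way down to its grainsize, here 1000 iterations per chunk:
//
//     tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n, 1000),
//                       [](const tbb::blocked_range<std::size_t>& r) {
//                           for (std::size_t i = r.begin(); i != r.end(); ++i) process(i);
//                       },
//                       tbb::simple_partitioner{});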

//! An auto partitioner
/** The range is initially divided into several large chunks.
    Chunks are further subdivided into smaller pieces if demand is detected and they are divisible.
    @ingroup algorithms */
class auto_partitioner {
public:
    auto_partitioner() {}

private:
    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
    template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
    // new implementation just extends existing interface
    typedef auto_partition_type task_partition_type;
    // TODO: consider to make split_type public
    typedef auto_partition_type::split_type split_type;

    //! Backward-compatible partition for auto and affinity partition objects.
    class partition_type {
        size_t num_chunks;
        static const size_t VICTIM_CHUNKS = 4;
    public:
        bool should_execute_range(const execution_data& ed) {
            if( num_chunks<VICTIM_CHUNKS && is_stolen_task(ed) )
                num_chunks = VICTIM_CHUNKS;
            return num_chunks==1;
        }
        partition_type( const auto_partitioner& )
            : num_chunks(get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4) {}
        partition_type( partition_type& pt, split ) {
            num_chunks = pt.num_chunks = (pt.num_chunks+1u) / 2u;
        }
    };
};
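
// Usage sketch for auto_partitioner (illustrative; assumes tbb/parallel_for.h is included).
// auto_partitioner is the default, so the explicit third argument below is optional:
//
//     tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n),
//                       [](const tbb::blocked_range<std::size_t>& r) { /* body */ },
//                       tbb::auto_partitioner{});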

//! A static partitioner
class static_partitioner {
public:
    static_partitioner() {}
private:
    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
    template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
    // new implementation just extends existing interface
    typedef static_partition_type task_partition_type;
    // TODO: consider to make split_type public
    typedef static_partition_type::split_type split_type;
};
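
// Usage sketch for static_partitioner (illustrative; assumes tbb/parallel_for.h is included).
// Work is distributed across the available threads up front, with no further demand-driven
// splitting, which suits loops with uniform per-iteration cost:
//
//     tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n),
//                       [](const tbb::blocked_range<std::size_t>& r) { /* body */ },
//                       tbb::static_partitioner{});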

//! An affinity partitioner
class affinity_partitioner : affinity_partitioner_base {
public:
    affinity_partitioner() {}

private:
    template<typename Range, typename Body, typename Partitioner> friend struct start_for;
    template<typename Range, typename Body, typename Partitioner> friend struct start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_deterministic_reduce;
    template<typename Range, typename Body, typename Partitioner> friend struct start_scan;
    // new implementation just extends existing interface
    typedef affinity_partition_type task_partition_type;
    // TODO: consider to make split_type public
    typedef affinity_partition_type::split_type split_type;
};
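
// Usage sketch for affinity_partitioner (illustrative; assumes tbb/parallel_for.h is included
// and body/n/num_passes are supplied by the caller). Unlike the other partitioners it is
// stateful and is passed by non-const reference, so the same object should be reused across
// invocations to replay the cache affinity recorded in earlier passes:
//
//     tbb::affinity_partitioner ap;   // keep alive across all passes
//     for (int pass = 0; pass < num_passes; ++pass)
//         tbb::parallel_for(tbb::blocked_range<std::size_t>(0, n), body, ap);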

} // namespace d1
} // namespace detail

inline namespace v1 {
// Partitioners
using detail::d1::auto_partitioner;
using detail::d1::simple_partitioner;
using detail::d1::static_partitioner;
using detail::d1::affinity_partitioner;
// Split types
using detail::split;
using detail::proportional_split;
} // namespace v1

} // namespace tbb

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
    #pragma warning (pop)
#endif // warning 4244 is back

#undef __TBB_INITIAL_CHUNKS
#undef __TBB_RANGE_POOL_CAPACITY
#undef __TBB_INIT_DEPTH

#endif /* __TBB_partitioner_H */