| 1 | /* |
| 2 | * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch. |
| 3 | */ |
| 4 | |
| 5 | //===----------------------------------------------------------------------===// |
| 6 | // |
| 7 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 8 | // See https://llvm.org/LICENSE.txt for license information. |
| 9 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #ifndef KMP_DISPATCH_H |
| 14 | #define KMP_DISPATCH_H |
| 15 | |
| 16 | /* ------------------------------------------------------------------------ */ |
| 17 | /* ------------------------------------------------------------------------ */ |
| 18 | |
| 19 | #include "kmp.h" |
| 20 | #include "kmp_error.h" |
| 21 | #include "kmp_i18n.h" |
| 22 | #include "kmp_itt.h" |
| 23 | #include "kmp_stats.h" |
| 24 | #include "kmp_str.h" |
| 25 | #if KMP_OS_WINDOWS && KMP_ARCH_X86 |
| 26 | #include <float.h> |
| 27 | #endif |
| 28 | |
| 29 | #if OMPT_SUPPORT |
| 30 | #include "ompt-internal.h" |
| 31 | #include "ompt-specific.h" |
| 32 | #endif |
| 33 | |
| 34 | /* ------------------------------------------------------------------------ */ |
| 35 | /* ------------------------------------------------------------------------ */ |
| 36 | #if KMP_USE_HIER_SCHED |
| 37 | // Forward declarations of some hierarchical scheduling data structures |
| 38 | template <typename T> struct kmp_hier_t; |
| 39 | template <typename T> struct kmp_hier_top_unit_t; |
| 40 | #endif // KMP_USE_HIER_SCHED |
| 41 | |
| 42 | template <typename T> struct dispatch_shared_info_template; |
| 43 | template <typename T> struct dispatch_private_info_template; |
| 44 | |
| 45 | template <typename T> |
| 46 | extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, |
| 47 | dispatch_private_info_template<T> *pr, |
| 48 | enum sched_type schedule, T lb, T ub, |
| 49 | typename traits_t<T>::signed_t st, |
| 50 | #if USE_ITT_BUILD |
| 51 | kmp_uint64 *cur_chunk, |
| 52 | #endif |
| 53 | typename traits_t<T>::signed_t chunk, |
| 54 | T nproc, T unit_id); |
| 55 | template <typename T> |
| 56 | extern int __kmp_dispatch_next_algorithm( |
| 57 | int gtid, dispatch_private_info_template<T> *pr, |
| 58 | dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb, |
| 59 | T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id); |
| 60 | |
| 61 | void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref); |
| 62 | void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref); |
| 63 | |
| 64 | #if KMP_STATIC_STEAL_ENABLED |
| 65 | |
| 66 | // replaces dispatch_private_info{32,64} structures and |
| 67 | // dispatch_private_info{32,64}_t types |
| 68 | template <typename T> struct dispatch_private_infoXX_template { |
| 69 | typedef typename traits_t<T>::unsigned_t UT; |
| 70 | typedef typename traits_t<T>::signed_t ST; |
| 71 | UT count; // unsigned |
| 72 | T ub; |
| 73 | /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ |
| 74 | T lb; |
| 75 | ST st; // signed |
| 76 | UT tc; // unsigned |
| 77 | kmp_lock_t *steal_lock; // lock used for chunk stealing |
| 78 | |
| 79 | UT ordered_lower; // unsigned |
| 80 | UT ordered_upper; // unsigned |
| 81 | |
| 82 | /* parm[1-4] are used in different ways by different scheduling algorithms */ |
| 83 | |
| 84 | // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on ) |
| 85 | // a) parm3 is properly aligned and |
| 86 | // b) all parm1-4 are in the same cache line. |
| 87 | // Because of parm1-4 are used together, performance seems to be better |
| 88 | // if they are in the same line (not measured though). |
| 89 | struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4 |
| 90 | T parm1; |
| 91 | T parm2; |
| 92 | T parm3; |
| 93 | T parm4; |
| 94 | }; |
| 95 | |
| 96 | #if KMP_WEIGHTED_ITERATIONS_SUPPORTED |
| 97 | UT pchunks; // total number of chunks for processes with p-core |
| 98 | UT num_procs_with_pcore; // number of threads with p-core |
| 99 | T first_thread_with_ecore; |
| 100 | #endif |
| 101 | #if KMP_OS_WINDOWS |
| 102 | T last_upper; |
| 103 | #endif /* KMP_OS_WINDOWS */ |
| 104 | }; |
| 105 | |
| 106 | #else /* KMP_STATIC_STEAL_ENABLED */ |
| 107 | |
| 108 | // replaces dispatch_private_info{32,64} structures and |
| 109 | // dispatch_private_info{32,64}_t types |
| 110 | template <typename T> struct dispatch_private_infoXX_template { |
| 111 | typedef typename traits_t<T>::unsigned_t UT; |
| 112 | typedef typename traits_t<T>::signed_t ST; |
| 113 | T lb; |
| 114 | T ub; |
| 115 | ST st; // signed |
| 116 | UT tc; // unsigned |
| 117 | |
| 118 | T parm1; |
| 119 | T parm2; |
| 120 | T parm3; |
| 121 | T parm4; |
| 122 | |
| 123 | UT count; // unsigned |
| 124 | |
| 125 | UT ordered_lower; // unsigned |
| 126 | UT ordered_upper; // unsigned |
| 127 | #if KMP_OS_WINDOWS |
| 128 | T last_upper; |
| 129 | #endif /* KMP_OS_WINDOWS */ |
| 130 | }; |
| 131 | #endif /* KMP_STATIC_STEAL_ENABLED */ |
| 132 | |
| 133 | template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template { |
| 134 | // duplicate alignment here, otherwise size of structure is not correct in our |
| 135 | // compiler |
| 136 | union KMP_ALIGN_CACHE private_info_tmpl { |
| 137 | dispatch_private_infoXX_template<T> p; |
| 138 | dispatch_private_info64_t p64; |
| 139 | } u; |
| 140 | enum sched_type schedule; /* scheduling algorithm */ |
| 141 | kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */ |
| 142 | std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer |
| 143 | kmp_uint32 ordered_bumped; |
| 144 | dispatch_private_info *next; /* stack of buffers for nest of serial regions */ |
| 145 | kmp_uint32 type_size; |
| 146 | #if KMP_USE_HIER_SCHED |
| 147 | kmp_int32 hier_id; |
| 148 | kmp_hier_top_unit_t<T> *hier_parent; |
| 149 | // member functions |
| 150 | kmp_int32 get_hier_id() const { return hier_id; } |
| 151 | kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; } |
| 152 | #endif |
| 153 | enum cons_type pushed_ws; |
| 154 | }; |
| 155 | |
| 156 | // replaces dispatch_shared_info{32,64} structures and |
| 157 | // dispatch_shared_info{32,64}_t types |
| 158 | template <typename T> struct dispatch_shared_infoXX_template { |
| 159 | typedef typename traits_t<T>::unsigned_t UT; |
| 160 | typedef typename traits_t<T>::signed_t ST; |
| 161 | /* chunk index under dynamic, number of idle threads under static-steal; |
| 162 | iteration index otherwise */ |
| 163 | volatile UT iteration; |
| 164 | volatile ST num_done; |
| 165 | volatile UT ordered_iteration; |
| 166 | // to retain the structure size making ordered_iteration scalar |
| 167 | UT ordered_dummy[KMP_MAX_ORDERED - 3]; |
| 168 | }; |
| 169 | |
| 170 | // replaces dispatch_shared_info structure and dispatch_shared_info_t type |
| 171 | template <typename T> struct dispatch_shared_info_template { |
| 172 | typedef typename traits_t<T>::unsigned_t UT; |
| 173 | // we need union here to keep the structure size |
| 174 | union shared_info_tmpl { |
| 175 | dispatch_shared_infoXX_template<UT> s; |
| 176 | dispatch_shared_info64_t s64; |
| 177 | } u; |
| 178 | volatile kmp_uint32 buffer_index; |
| 179 | volatile kmp_int32 doacross_buf_idx; // teamwise index |
| 180 | kmp_uint32 *doacross_flags; // array of iteration flags (0/1) |
| 181 | kmp_int32 doacross_num_done; // count finished threads |
| 182 | #if KMP_USE_HIER_SCHED |
| 183 | kmp_hier_t<T> *hier; |
| 184 | #endif |
| 185 | #if KMP_USE_HWLOC |
| 186 | // When linking with libhwloc, the ORDERED EPCC test slowsdown on big |
| 187 | // machines (> 48 cores). Performance analysis showed that a cache thrash |
| 188 | // was occurring and this padding helps alleviate the problem. |
| 189 | char padding[64]; |
| 190 | #endif |
| 191 | }; |
| 192 | |
| 193 | /* ------------------------------------------------------------------------ */ |
| 194 | /* ------------------------------------------------------------------------ */ |
| 195 | |
| 196 | #undef USE_TEST_LOCKS |
| 197 | |
| 198 | // test_then_add template (general template should NOT be used) |
| 199 | template <typename T> static __forceinline T test_then_add(volatile T *p, T d); |
| 200 | |
| 201 | template <> |
| 202 | __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p, |
| 203 | kmp_int32 d) { |
| 204 | kmp_int32 r; |
| 205 | r = KMP_TEST_THEN_ADD32(p, d); |
| 206 | return r; |
| 207 | } |
| 208 | |
| 209 | template <> |
| 210 | __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p, |
| 211 | kmp_int64 d) { |
| 212 | kmp_int64 r; |
| 213 | r = KMP_TEST_THEN_ADD64(p, d); |
| 214 | return r; |
| 215 | } |
| 216 | |
| 217 | // test_then_inc_acq template (general template should NOT be used) |
| 218 | template <typename T> static __forceinline T test_then_inc_acq(volatile T *p); |
| 219 | |
| 220 | template <> |
| 221 | __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) { |
| 222 | kmp_int32 r; |
| 223 | r = KMP_TEST_THEN_INC_ACQ32(p); |
| 224 | return r; |
| 225 | } |
| 226 | |
| 227 | template <> |
| 228 | __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) { |
| 229 | kmp_int64 r; |
| 230 | r = KMP_TEST_THEN_INC_ACQ64(p); |
| 231 | return r; |
| 232 | } |
| 233 | |
| 234 | // test_then_inc template (general template should NOT be used) |
| 235 | template <typename T> static __forceinline T test_then_inc(volatile T *p); |
| 236 | |
| 237 | template <> |
| 238 | __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) { |
| 239 | kmp_int32 r; |
| 240 | r = KMP_TEST_THEN_INC32(p); |
| 241 | return r; |
| 242 | } |
| 243 | |
| 244 | template <> |
| 245 | __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) { |
| 246 | kmp_int64 r; |
| 247 | r = KMP_TEST_THEN_INC64(p); |
| 248 | return r; |
| 249 | } |
| 250 | |
| 251 | // compare_and_swap template (general template should NOT be used) |
| 252 | template <typename T> |
| 253 | static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s); |
| 254 | |
| 255 | template <> |
| 256 | __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p, |
| 257 | kmp_int32 c, kmp_int32 s) { |
| 258 | return KMP_COMPARE_AND_STORE_REL32(p, c, s); |
| 259 | } |
| 260 | |
| 261 | template <> |
| 262 | __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p, |
| 263 | kmp_int64 c, kmp_int64 s) { |
| 264 | return KMP_COMPARE_AND_STORE_REL64(p, c, s); |
| 265 | } |
| 266 | |
| 267 | template <typename T> kmp_uint32 __kmp_ge(T value, T checker) { |
| 268 | return value >= checker; |
| 269 | } |
| 270 | template <typename T> kmp_uint32 __kmp_eq(T value, T checker) { |
| 271 | return value == checker; |
| 272 | } |
| 273 | |
| 274 | /* |
| 275 | Spin wait loop that pauses between checks. |
| 276 | Waits until function returns non-zero when called with *spinner and check. |
| 277 | Does NOT put threads to sleep. |
| 278 | Arguments: |
| 279 | UT is unsigned 4- or 8-byte type |
| 280 | spinner - memory location to check value |
| 281 | checker - value which spinner is >, <, ==, etc. |
| 282 | pred - predicate function to perform binary comparison of some sort |
| 283 | #if USE_ITT_BUILD |
| 284 | obj -- is higher-level synchronization object to report to ittnotify. It |
| 285 | is used to report locks consistently. For example, if lock is acquired |
| 286 | immediately, its address is reported to ittnotify via |
| 287 | KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately |
| 288 | and lock routine calls to KMP_WAIT(), the later should report the |
| 289 | same address, not an address of low-level spinner. |
| 290 | #endif // USE_ITT_BUILD |
| 291 | TODO: make inline function (move to header file for icl) |
| 292 | */ |
| 293 | template <typename UT> |
| 294 | static UT __kmp_wait(volatile UT *spinner, UT checker, |
| 295 | kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) { |
| 296 | // note: we may not belong to a team at this point |
| 297 | volatile UT *spin = spinner; |
| 298 | UT check = checker; |
| 299 | kmp_uint32 spins; |
| 300 | kmp_uint32 (*f)(UT, UT) = pred; |
| 301 | kmp_uint64 time; |
| 302 | UT r; |
| 303 | |
| 304 | KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); |
| 305 | KMP_INIT_YIELD(spins); |
| 306 | KMP_INIT_BACKOFF(time); |
| 307 | // main wait spin loop |
| 308 | while (!f(r = *spin, check)) { |
| 309 | KMP_FSYNC_SPIN_PREPARE(obj); |
| 310 | /* GEH - remove this since it was accidentally introduced when kmp_wait was |
| 311 | split. |
| 312 | It causes problems with infinite recursion because of exit lock */ |
| 313 | /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) |
| 314 | __kmp_abort_thread(); */ |
| 315 | // If oversubscribed, or have waited a bit then yield. |
| 316 | KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); |
| 317 | } |
| 318 | KMP_FSYNC_SPIN_ACQUIRED(obj); |
| 319 | return r; |
| 320 | } |
| 321 | |
| 322 | /* ------------------------------------------------------------------------ */ |
| 323 | /* ------------------------------------------------------------------------ */ |
| 324 | |
| 325 | template <typename UT> |
| 326 | void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { |
| 327 | dispatch_private_info_template<UT> *pr; |
| 328 | |
| 329 | int gtid = *gtid_ref; |
| 330 | // int cid = *cid_ref; |
| 331 | kmp_info_t *th = __kmp_threads[gtid]; |
| 332 | KMP_DEBUG_ASSERT(th->th.th_dispatch); |
| 333 | |
| 334 | KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n" , gtid)); |
| 335 | if (__kmp_env_consistency_check) { |
| 336 | pr = reinterpret_cast<dispatch_private_info_template<UT> *>( |
| 337 | th->th.th_dispatch->th_dispatch_pr_current); |
| 338 | if (pr->pushed_ws != ct_none) { |
| 339 | #if KMP_USE_DYNAMIC_LOCK |
| 340 | __kmp_push_sync(gtid, ct: ct_ordered_in_pdo, ident: loc_ref, NULL, 0); |
| 341 | #else |
| 342 | __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL); |
| 343 | #endif |
| 344 | } |
| 345 | } |
| 346 | |
| 347 | if (!th->th.th_team->t.t_serialized) { |
| 348 | dispatch_shared_info_template<UT> *sh = |
| 349 | reinterpret_cast<dispatch_shared_info_template<UT> *>( |
| 350 | th->th.th_dispatch->th_dispatch_sh_current); |
| 351 | UT lower; |
| 352 | |
| 353 | if (!__kmp_env_consistency_check) { |
| 354 | pr = reinterpret_cast<dispatch_private_info_template<UT> *>( |
| 355 | th->th.th_dispatch->th_dispatch_pr_current); |
| 356 | } |
| 357 | lower = pr->u.p.ordered_lower; |
| 358 | |
| 359 | #if !defined(KMP_GOMP_COMPAT) |
| 360 | if (__kmp_env_consistency_check) { |
| 361 | if (pr->ordered_bumped) { |
| 362 | struct cons_header *p = __kmp_threads[gtid]->th.th_cons; |
| 363 | __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, |
| 364 | ct_ordered_in_pdo, loc_ref, |
| 365 | &p->stack_data[p->w_top]); |
| 366 | } |
| 367 | } |
| 368 | #endif /* !defined(KMP_GOMP_COMPAT) */ |
| 369 | |
| 370 | KMP_MB(); |
| 371 | #ifdef KMP_DEBUG |
| 372 | { |
| 373 | char *buff; |
| 374 | // create format specifiers before the debug output |
| 375 | buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: " |
| 376 | "ordered_iter:%%%s lower:%%%s\n" , |
| 377 | traits_t<UT>::spec, traits_t<UT>::spec); |
| 378 | KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); |
| 379 | __kmp_str_free(str: &buff); |
| 380 | } |
| 381 | #endif |
| 382 | __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower, |
| 383 | __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); |
| 384 | KMP_MB(); /* is this necessary? */ |
| 385 | #ifdef KMP_DEBUG |
| 386 | { |
| 387 | char *buff; |
| 388 | // create format specifiers before the debug output |
| 389 | buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: " |
| 390 | "ordered_iter:%%%s lower:%%%s\n" , |
| 391 | traits_t<UT>::spec, traits_t<UT>::spec); |
| 392 | KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); |
| 393 | __kmp_str_free(str: &buff); |
| 394 | } |
| 395 | #endif |
| 396 | } |
| 397 | KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n" , gtid)); |
| 398 | } |
| 399 | |
| 400 | template <typename UT> |
| 401 | void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { |
| 402 | typedef typename traits_t<UT>::signed_t ST; |
| 403 | dispatch_private_info_template<UT> *pr; |
| 404 | |
| 405 | int gtid = *gtid_ref; |
| 406 | // int cid = *cid_ref; |
| 407 | kmp_info_t *th = __kmp_threads[gtid]; |
| 408 | KMP_DEBUG_ASSERT(th->th.th_dispatch); |
| 409 | |
| 410 | KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n" , gtid)); |
| 411 | if (__kmp_env_consistency_check) { |
| 412 | pr = reinterpret_cast<dispatch_private_info_template<UT> *>( |
| 413 | th->th.th_dispatch->th_dispatch_pr_current); |
| 414 | if (pr->pushed_ws != ct_none) { |
| 415 | __kmp_pop_sync(gtid, ct: ct_ordered_in_pdo, ident: loc_ref); |
| 416 | } |
| 417 | } |
| 418 | |
| 419 | if (!th->th.th_team->t.t_serialized) { |
| 420 | dispatch_shared_info_template<UT> *sh = |
| 421 | reinterpret_cast<dispatch_shared_info_template<UT> *>( |
| 422 | th->th.th_dispatch->th_dispatch_sh_current); |
| 423 | |
| 424 | if (!__kmp_env_consistency_check) { |
| 425 | pr = reinterpret_cast<dispatch_private_info_template<UT> *>( |
| 426 | th->th.th_dispatch->th_dispatch_pr_current); |
| 427 | } |
| 428 | |
| 429 | KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration)); |
| 430 | #if !defined(KMP_GOMP_COMPAT) |
| 431 | if (__kmp_env_consistency_check) { |
| 432 | if (pr->ordered_bumped != 0) { |
| 433 | struct cons_header *p = __kmp_threads[gtid]->th.th_cons; |
| 434 | /* How to test it? - OM */ |
| 435 | __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, |
| 436 | ct_ordered_in_pdo, loc_ref, |
| 437 | &p->stack_data[p->w_top]); |
| 438 | } |
| 439 | } |
| 440 | #endif /* !defined(KMP_GOMP_COMPAT) */ |
| 441 | |
| 442 | KMP_MB(); /* Flush all pending memory write invalidates. */ |
| 443 | |
| 444 | pr->ordered_bumped += 1; |
| 445 | |
| 446 | KD_TRACE(1000, |
| 447 | ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n" , |
| 448 | gtid, pr->ordered_bumped)); |
| 449 | |
| 450 | KMP_MB(); /* Flush all pending memory write invalidates. */ |
| 451 | |
| 452 | /* TODO use general release procedure? */ |
| 453 | test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); |
| 454 | |
| 455 | KMP_MB(); /* Flush all pending memory write invalidates. */ |
| 456 | } |
| 457 | KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n" , gtid)); |
| 458 | } |
| 459 | |
| 460 | /* Computes and returns x to the power of y, where y must a non-negative integer |
| 461 | */ |
| 462 | template <typename UT> |
| 463 | static __forceinline long double __kmp_pow(long double x, UT y) { |
| 464 | long double s = 1.0L; |
| 465 | |
| 466 | KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); |
| 467 | // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned |
| 468 | while (y) { |
| 469 | if (y & 1) |
| 470 | s *= x; |
| 471 | x *= x; |
| 472 | y >>= 1; |
| 473 | } |
| 474 | return s; |
| 475 | } |
| 476 | |
| 477 | /* Computes and returns the number of unassigned iterations after idx chunks |
| 478 | have been assigned |
| 479 | (the total number of unassigned iterations in chunks with index greater than |
| 480 | or equal to idx). |
| 481 | __forceinline seems to be broken so that if we __forceinline this function, |
| 482 | the behavior is wrong |
| 483 | (one of the unit tests, sch_guided_analytical_basic.cpp, fails) |
| 484 | */ |
| 485 | template <typename T> |
| 486 | static __inline typename traits_t<T>::unsigned_t |
| 487 | __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base, |
| 488 | typename traits_t<T>::unsigned_t idx) { |
| 489 | /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at |
| 490 | least for ICL 8.1, long double arithmetic may not really have |
| 491 | long double precision, even with /Qlong_double. Currently, we |
| 492 | workaround that in the caller code, by manipulating the FPCW for |
| 493 | Windows* OS on IA-32 architecture. The lack of precision is not |
| 494 | expected to be a correctness issue, though. |
| 495 | */ |
| 496 | typedef typename traits_t<T>::unsigned_t UT; |
| 497 | |
| 498 | long double x = tc * __kmp_pow<UT>(base, idx); |
| 499 | UT r = (UT)x; |
| 500 | if (x == r) |
| 501 | return r; |
| 502 | return r + 1; |
| 503 | } |
| 504 | |
| 505 | // Parameters of the guided-iterative algorithm: |
| 506 | // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic |
| 507 | // p3 = 1 / ( n * nproc ) // remaining iterations multiplier |
| 508 | // by default n = 2. For example with n = 3 the chunks distribution will be more |
| 509 | // flat. |
| 510 | // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc. |
| 511 | static const int guided_int_param = 2; |
| 512 | static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param; |
| 513 | #endif // KMP_DISPATCH_H |
| 514 | |