1 | /* |
2 | * kmp_barrier.cpp |
3 | */ |
4 | |
5 | //===----------------------------------------------------------------------===// |
6 | // |
7 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
8 | // See https://llvm.org/LICENSE.txt for license information. |
9 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "kmp_wait_release.h" |
14 | #include "kmp_barrier.h" |
15 | #include "kmp_itt.h" |
16 | #include "kmp_os.h" |
17 | #include "kmp_stats.h" |
18 | #include "ompt-specific.h" |
19 | // for distributed barrier |
20 | #include "kmp_affinity.h" |
21 | |
22 | #if KMP_MIC |
23 | #include <immintrin.h> |
24 | #define USE_NGO_STORES 1 |
25 | #endif // KMP_MIC |
26 | |
27 | #if KMP_MIC && USE_NGO_STORES |
28 | // ICV copying |
29 | #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) |
30 | #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) |
31 | #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) |
32 | #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory") |
33 | #else |
34 | #define ngo_load(src) ((void)0) |
35 | #define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) |
36 | #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) |
37 | #define ngo_sync() ((void)0) |
38 | #endif /* KMP_MIC && USE_NGO_STORES */ |
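// Note: on KMP_MIC builds with USE_NGO_STORES these macros broadcast a cache
// line of ICVs with 512-bit non-globally-ordered streaming stores, which is
// why ngo_sync() exists as an explicit fence; on all other builds they
// degrade to plain copy_icvs()/KMP_MEMCPY and the sync is a no-op.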
39 | |
40 | void __kmp_print_structure(void); // Forward declaration |
41 | |
42 | // ---------------------------- Barrier Algorithms ---------------------------- |
43 | // Distributed barrier |
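// Layout used by the routines below (the types live in kmp_barrier.h): every
// thread owns one cache-line-sized slot in the flags[iter], go, iter and
// sleep arrays. Workers announce arrival by clearing their own
// flags[iter][tid].stillNeed, group leaders poll the slots of their group,
// the primary thread polls the group leaders, and the release walks the
// other way by storing a new value into the go[] counters being spun on.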
44 | |
45 | // Compute how many threads to have polling each cache-line. |
46 | // We want to limit the number of writes to IDEAL_GO_RESOLUTION. |
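// For illustration (the numbers depend on the detected topology and are not
// normative): on the first call, with n = 64 threads on 2 sockets x 16 cores
// and KMP_OPTIMIZE_FOR_REDUCTIONS disabled, threads_per_go = 16 >> 1 = 8,
// num_gos = 64 / 8 = 8, num_groups = 8 / 2 = 4, gos_per_group = 8 / 4 = 2,
// and threads_per_group = 8 * 2 = 16.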
47 | void distributedBarrier::computeVarsForN(size_t n) { |
48 | int nsockets = 1; |
49 | if (__kmp_topology) { |
    int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
    int core_level = __kmp_topology->get_level(KMP_HW_CORE);
    int ncores_per_socket =
        __kmp_topology->calculate_ratio(core_level, socket_level);
    nsockets = __kmp_topology->get_count(socket_level);
55 | |
56 | if (nsockets <= 0) |
57 | nsockets = 1; |
58 | if (ncores_per_socket <= 0) |
59 | ncores_per_socket = 1; |
60 | |
61 | threads_per_go = ncores_per_socket >> 1; |
62 | if (!fix_threads_per_go) { |
63 | // Minimize num_gos |
64 | if (threads_per_go > 4) { |
65 | if (KMP_OPTIMIZE_FOR_REDUCTIONS) { |
66 | threads_per_go = threads_per_go >> 1; |
67 | } |
68 | if (threads_per_go > 4 && nsockets == 1) |
69 | threads_per_go = threads_per_go >> 1; |
70 | } |
71 | } |
72 | if (threads_per_go == 0) |
73 | threads_per_go = 1; |
74 | fix_threads_per_go = true; |
75 | num_gos = n / threads_per_go; |
76 | if (n % threads_per_go) |
77 | num_gos++; |
78 | if (nsockets == 1 || num_gos == 1) |
79 | num_groups = 1; |
80 | else { |
81 | num_groups = num_gos / nsockets; |
82 | if (num_gos % nsockets) |
83 | num_groups++; |
84 | } |
85 | if (num_groups <= 0) |
86 | num_groups = 1; |
87 | gos_per_group = num_gos / num_groups; |
88 | if (num_gos % num_groups) |
89 | gos_per_group++; |
90 | threads_per_group = threads_per_go * gos_per_group; |
91 | } else { |
92 | num_gos = n / threads_per_go; |
93 | if (n % threads_per_go) |
94 | num_gos++; |
95 | if (num_gos == 1) |
96 | num_groups = 1; |
97 | else { |
98 | num_groups = num_gos / 2; |
99 | if (num_gos % 2) |
100 | num_groups++; |
101 | } |
102 | gos_per_group = num_gos / num_groups; |
103 | if (num_gos % num_groups) |
104 | gos_per_group++; |
105 | threads_per_group = threads_per_go * gos_per_group; |
106 | } |
107 | } |
108 | |
109 | void distributedBarrier::computeGo(size_t n) { |
110 | // Minimize num_gos |
111 | for (num_gos = 1;; num_gos++) |
112 | if (IDEAL_CONTENTION * num_gos >= n) |
113 | break; |
114 | threads_per_go = n / num_gos; |
115 | if (n % num_gos) |
116 | threads_per_go++; |
117 | while (num_gos > MAX_GOS) { |
118 | threads_per_go++; |
119 | num_gos = n / threads_per_go; |
120 | if (n % threads_per_go) |
121 | num_gos++; |
122 | } |
123 | computeVarsForN(n); |
124 | } |
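// Illustrative walk-through of computeGo() above (assuming, purely for
// illustration, that IDEAL_CONTENTION is 16 and MAX_GOS is large enough that
// the trailing while loop does not fire): for n = 40 the first loop stops at
// num_gos = 3 since 16 * 3 >= 40, threads_per_go rounds up to 14, and
// computeVarsForN(40) then finishes the layout (and may still adjust
// threads_per_go from the topology).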
125 | |
126 | // This function is to resize the barrier arrays when the new number of threads |
127 | // exceeds max_threads, which is the current size of all the arrays |
128 | void distributedBarrier::resize(size_t nthr) { |
129 | KMP_DEBUG_ASSERT(nthr > max_threads); |
130 | |
131 | // expand to requested size * 2 |
132 | max_threads = nthr * 2; |
133 | |
134 | // allocate arrays to new max threads |
135 | for (int i = 0; i < MAX_ITERS; ++i) { |
136 | if (flags[i]) |
137 | flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i], |
138 | max_threads * sizeof(flags_s)); |
139 | else |
140 | flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s)); |
141 | } |
142 | |
143 | if (go) |
144 | go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s)); |
145 | else |
146 | go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s)); |
147 | |
148 | if (iter) |
149 | iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s)); |
150 | else |
151 | iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s)); |
152 | |
153 | if (sleep) |
154 | sleep = |
155 | (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s)); |
156 | else |
157 | sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s)); |
158 | } |
159 | |
160 | // This function is to set all the go flags that threads might be waiting |
161 | // on, and when blocktime is not infinite, it should be followed by a wake-up |
162 | // call to each thread |
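// Note on the go protocol: a waiter whose private counter is iter[tid].iter
// expects its go flag to reach iter + MAX_ITERS, and iter advances modulo
// MAX_ITERS after every barrier, so up to MAX_ITERS consecutive barrier
// episodes are distinguished by distinct go values and a late thread cannot
// confuse an old release with a new one.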
163 | kmp_uint64 distributedBarrier::go_release() { |
164 | kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS; |
165 | for (size_t j = 0; j < num_gos; j++) { |
    go[j].go.store(next_go);
167 | } |
168 | return next_go; |
169 | } |
170 | |
171 | void distributedBarrier::go_reset() { |
172 | for (size_t j = 0; j < max_threads; ++j) { |
173 | for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) { |
174 | flags[i][j].stillNeed = 1; |
175 | } |
    go[j].go.store(0);
177 | iter[j].iter = 0; |
178 | } |
179 | } |
180 | |
181 | // This function inits/re-inits the distributed barrier for a particular number |
182 | // of threads. If a resize of arrays is needed, it calls the resize function. |
183 | void distributedBarrier::init(size_t nthr) { |
184 | size_t old_max = max_threads; |
185 | if (nthr > max_threads) { // need more space in arrays |
186 | resize(nthr); |
187 | } |
188 | |
189 | for (size_t i = 0; i < max_threads; i++) { |
190 | for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) { |
191 | flags[j][i].stillNeed = 1; |
192 | } |
    go[i].go.store(0);
194 | iter[i].iter = 0; |
195 | if (i >= old_max) |
196 | sleep[i].sleep = false; |
197 | } |
198 | |
199 | // Recalculate num_gos, etc. based on new nthr |
  computeVarsForN(nthr);
201 | |
202 | num_threads = nthr; |
203 | |
204 | if (team_icvs == NULL) |
205 | team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t)); |
206 | } |
207 | |
208 | // This function is used only when KMP_BLOCKTIME is not infinite. |
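// The release code below uses this helper in two patterns: the primary thread
// wakes the group leaders (inc = threads_per_group) and each group leader then
// wakes the members of its own group (inc = 1).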
209 | // static |
210 | void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team, |
211 | size_t start, size_t stop, size_t inc, |
212 | size_t tid) { |
213 | KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME); |
214 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
215 | return; |
216 | |
217 | kmp_info_t **other_threads = team->t.t_threads; |
218 | for (size_t thr = start; thr < stop; thr += inc) { |
219 | KMP_DEBUG_ASSERT(other_threads[thr]); |
220 | int gtid = other_threads[thr]->th.th_info.ds.ds_gtid; |
221 | // Wake up worker regardless of if it appears to be sleeping or not |
    __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223 | } |
224 | } |
225 | |
226 | static void __kmp_dist_barrier_gather( |
227 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
228 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
229 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather); |
230 | kmp_team_t *team; |
231 | distributedBarrier *b; |
232 | kmp_info_t **other_threads; |
233 | kmp_uint64 my_current_iter, my_next_iter; |
234 | kmp_uint32 nproc; |
235 | bool group_leader; |
236 | |
237 | team = this_thr->th.th_team; |
238 | nproc = this_thr->th.th_team_nproc; |
239 | other_threads = team->t.t_threads; |
240 | b = team->t.b; |
241 | my_current_iter = b->iter[tid].iter; |
242 | my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS; |
243 | group_leader = ((tid % b->threads_per_group) == 0); |
244 | |
245 | KA_TRACE(20, |
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247 | gtid, team->t.t_id, tid, bt)); |
248 | |
249 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
250 | // Barrier imbalance - save arrive time to the thread |
251 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
252 | this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = |
253 | __itt_get_timestamp(); |
254 | } |
255 | #endif |
256 | |
257 | if (group_leader) { |
258 | // Start from the thread after the group leader |
259 | size_t group_start = tid + 1; |
260 | size_t group_end = tid + b->threads_per_group; |
261 | size_t threads_pending = 0; |
262 | |
263 | if (group_end > nproc) |
264 | group_end = nproc; |
265 | do { // wait for threads in my group |
266 | threads_pending = 0; |
      // Check all the flags every time to avoid branch mispredict
268 | for (size_t thr = group_start; thr < group_end; thr++) { |
269 | // Each thread uses a different cache line |
270 | threads_pending += b->flags[my_current_iter][thr].stillNeed; |
271 | } |
272 | // Execute tasks here |
273 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
274 | kmp_task_team_t *task_team = this_thr->th.th_task_team; |
275 | if (task_team != NULL) { |
276 | if (TCR_SYNC_4(task_team->tt.tt_active)) { |
277 | if (KMP_TASKING_ENABLED(task_team)) { |
278 | int tasks_completed = FALSE; |
            __kmp_atomic_execute_tasks_64(
                this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282 | } else |
283 | this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; |
284 | } |
285 | } else { |
286 | this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; |
287 | } // if |
288 | } |
289 | if (TCR_4(__kmp_global.g.g_done)) { |
290 | if (__kmp_global.g.g_abort) |
291 | __kmp_abort_thread(); |
292 | break; |
293 | } else if (__kmp_tasking_mode != tskm_immediate_exec && |
294 | this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { |
295 | this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; |
296 | } |
297 | } while (threads_pending > 0); |
298 | |
299 | if (reduce) { // Perform reduction if needed |
300 | OMPT_REDUCTION_DECL(this_thr, gtid); |
301 | OMPT_REDUCTION_BEGIN; |
302 | // Group leader reduces all threads in group |
303 | for (size_t thr = group_start; thr < group_end; thr++) { |
304 | (*reduce)(this_thr->th.th_local.reduce_data, |
305 | other_threads[thr]->th.th_local.reduce_data); |
306 | } |
307 | OMPT_REDUCTION_END; |
308 | } |
309 | |
310 | // Set flag for next iteration |
311 | b->flags[my_next_iter][tid].stillNeed = 1; |
312 | // Each thread uses a different cache line; resets stillNeed to 0 to |
313 | // indicate it has reached the barrier |
314 | b->flags[my_current_iter][tid].stillNeed = 0; |
315 | |
316 | do { // wait for all group leaders |
317 | threads_pending = 0; |
318 | for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) { |
319 | threads_pending += b->flags[my_current_iter][thr].stillNeed; |
320 | } |
321 | // Execute tasks here |
322 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
323 | kmp_task_team_t *task_team = this_thr->th.th_task_team; |
324 | if (task_team != NULL) { |
325 | if (TCR_SYNC_4(task_team->tt.tt_active)) { |
326 | if (KMP_TASKING_ENABLED(task_team)) { |
327 | int tasks_completed = FALSE; |
            __kmp_atomic_execute_tasks_64(
                this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331 | } else |
332 | this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; |
333 | } |
334 | } else { |
335 | this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; |
336 | } // if |
337 | } |
338 | if (TCR_4(__kmp_global.g.g_done)) { |
339 | if (__kmp_global.g.g_abort) |
340 | __kmp_abort_thread(); |
341 | break; |
342 | } else if (__kmp_tasking_mode != tskm_immediate_exec && |
343 | this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { |
344 | this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; |
345 | } |
346 | } while (threads_pending > 0); |
347 | |
348 | if (reduce) { // Perform reduction if needed |
349 | if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders |
350 | OMPT_REDUCTION_DECL(this_thr, gtid); |
351 | OMPT_REDUCTION_BEGIN; |
352 | for (size_t thr = b->threads_per_group; thr < nproc; |
353 | thr += b->threads_per_group) { |
354 | (*reduce)(this_thr->th.th_local.reduce_data, |
355 | other_threads[thr]->th.th_local.reduce_data); |
356 | } |
357 | OMPT_REDUCTION_END; |
358 | } |
359 | } |
360 | } else { |
361 | // Set flag for next iteration |
362 | b->flags[my_next_iter][tid].stillNeed = 1; |
363 | // Each thread uses a different cache line; resets stillNeed to 0 to |
364 | // indicate it has reached the barrier |
365 | b->flags[my_current_iter][tid].stillNeed = 0; |
366 | } |
367 | |
368 | KMP_MFENCE(); |
369 | |
370 | KA_TRACE(20, |
           ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372 | gtid, team->t.t_id, tid, bt)); |
373 | } |
374 | |
375 | static void __kmp_dist_barrier_release( |
376 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
377 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
378 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release); |
379 | kmp_team_t *team; |
380 | distributedBarrier *b; |
381 | kmp_bstate_t *thr_bar; |
382 | kmp_uint64 my_current_iter, next_go; |
383 | size_t my_go_index; |
384 | bool group_leader; |
385 | |
  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387 | gtid, tid, bt)); |
388 | |
389 | thr_bar = &this_thr->th.th_bar[bt].bb; |
390 | |
391 | if (!KMP_MASTER_TID(tid)) { |
392 | // workers and non-master group leaders need to check their presence in team |
393 | do { |
394 | if (this_thr->th.th_used_in_team.load() != 1 && |
395 | this_thr->th.th_used_in_team.load() != 3) { |
396 | // Thread is not in use in a team. Wait on location in tid's thread |
397 | // struct. The 0 value tells anyone looking that this thread is spinning |
398 | // or sleeping until this location becomes 3 again; 3 is the transition |
399 | // state to get to 1 which is waiting on go and being in the team |
400 | kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3); |
401 | if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, |
402 | 0) || |
403 | this_thr->th.th_used_in_team.load() == 0) { |
          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405 | } |
406 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
407 | if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { |
408 | // In fork barrier where we could not get the object reliably |
          itt_sync_obj =
              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
          // Cancel wait on previous parallel region...
          __kmp_itt_task_starting(itt_sync_obj);
413 | |
414 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
415 | return; |
416 | |
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418 | if (itt_sync_obj != NULL) |
419 | // Call prepare as early as possible for "new" barrier |
            __kmp_itt_task_finished(itt_sync_obj);
421 | } else |
422 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
423 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
424 | return; |
425 | } |
426 | if (this_thr->th.th_used_in_team.load() != 1 && |
427 | this_thr->th.th_used_in_team.load() != 3) // spurious wake-up? |
428 | continue; |
429 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
430 | return; |
431 | |
432 | // At this point, the thread thinks it is in use in a team, or in |
433 | // transition to be used in a team, but it might have reached this barrier |
434 | // before it was marked unused by the team. Unused threads are awoken and |
435 | // shifted to wait on local thread struct elsewhere. It also might reach |
436 | // this point by being picked up for use by a different team. Either way, |
437 | // we need to update the tid. |
438 | tid = __kmp_tid_from_gtid(gtid); |
439 | team = this_thr->th.th_team; |
440 | KMP_DEBUG_ASSERT(tid >= 0); |
441 | KMP_DEBUG_ASSERT(team); |
442 | b = team->t.b; |
443 | my_current_iter = b->iter[tid].iter; |
444 | next_go = my_current_iter + distributedBarrier::MAX_ITERS; |
445 | my_go_index = tid / b->threads_per_go; |
446 | if (this_thr->th.th_used_in_team.load() == 3) { |
447 | KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1); |
448 | } |
449 | // Check if go flag is set |
450 | if (b->go[my_go_index].go.load() != next_go) { |
451 | // Wait on go flag on team |
452 | kmp_atomic_flag_64<false, true> my_flag( |
453 | &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep)); |
        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
455 | KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter || |
456 | b->iter[tid].iter == 0); |
457 | KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false); |
458 | } |
459 | |
460 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
461 | return; |
462 | // At this point, the thread's go location was set. This means the primary |
463 | // thread is safely in the barrier, and so this thread's data is |
464 | // up-to-date, but we should check again that this thread is really in |
465 | // use in the team, as it could have been woken up for the purpose of |
466 | // changing team size, or reaping threads at shutdown. |
467 | if (this_thr->th.th_used_in_team.load() == 1) |
468 | break; |
469 | } while (1); |
470 | |
471 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
472 | return; |
473 | |
474 | group_leader = ((tid % b->threads_per_group) == 0); |
475 | if (group_leader) { |
476 | // Tell all the threads in my group they can go! |
477 | for (size_t go_idx = my_go_index + 1; |
478 | go_idx < my_go_index + b->gos_per_group; go_idx++) { |
        b->go[go_idx].go.store(next_go);
480 | } |
481 | // Fence added so that workers can see changes to go. sfence inadequate. |
482 | KMP_MFENCE(); |
483 | } |
484 | |
485 | #if KMP_BARRIER_ICV_PUSH |
486 | if (propagate_icvs) { // copy ICVs to final dest |
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                (kmp_internal_control_t *)team->t.b->team_icvs);
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
493 | } |
494 | #endif |
495 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) { |
496 | // This thread is now awake and participating in the barrier; |
497 | // wake up the other threads in the group |
498 | size_t nproc = this_thr->th.th_team_nproc; |
499 | size_t group_end = tid + b->threads_per_group; |
500 | if (nproc < group_end) |
501 | group_end = nproc; |
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
503 | } |
504 | } else { // Primary thread |
505 | team = this_thr->th.th_team; |
506 | b = team->t.b; |
507 | my_current_iter = b->iter[tid].iter; |
508 | next_go = my_current_iter + distributedBarrier::MAX_ITERS; |
509 | #if KMP_BARRIER_ICV_PUSH |
510 | if (propagate_icvs) { |
511 | // primary thread has ICVs in final destination; copy |
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
514 | } |
515 | #endif |
516 | // Tell all the group leaders they can go! |
517 | for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) { |
      b->go[go_idx].go.store(next_go);
519 | } |
520 | |
521 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
522 | // Wake-up the group leaders |
523 | size_t nproc = this_thr->th.th_team_nproc; |
      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
                                b->threads_per_group, tid);
526 | } |
527 | |
528 | // Tell all the threads in my group they can go! |
529 | for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) { |
      b->go[go_idx].go.store(next_go);
531 | } |
532 | |
533 | // Fence added so that workers can see changes to go. sfence inadequate. |
534 | KMP_MFENCE(); |
535 | |
536 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
537 | // Wake-up the other threads in my group |
538 | size_t nproc = this_thr->th.th_team_nproc; |
539 | size_t group_end = tid + b->threads_per_group; |
540 | if (nproc < group_end) |
541 | group_end = nproc; |
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
543 | } |
544 | } |
545 | // Update to next iteration |
546 | KMP_ASSERT(my_current_iter == b->iter[tid].iter); |
547 | b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS; |
548 | |
549 | KA_TRACE( |
      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
551 | gtid, team->t.t_id, tid, bt)); |
552 | } |
553 | |
554 | // Linear Barrier |
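// In the linear scheme the primary thread does all of the work: the gather
// waits for every worker's b_arrived to advance by KMP_BARRIER_STATE_BUMP
// (the low-order bits of the counter carry flag state such as the sleep bit,
// which is why the count moves by the bump constant rather than by 1), and
// the release bumps every worker's b_go in turn, so both phases are O(nproc)
// on the primary thread.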
555 | template <bool cancellable = false> |
556 | static bool __kmp_linear_barrier_gather_template( |
557 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
558 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
559 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather); |
560 | kmp_team_t *team = this_thr->th.th_team; |
561 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
562 | kmp_info_t **other_threads = team->t.t_threads; |
563 | |
564 | KA_TRACE( |
565 | 20, |
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
567 | gtid, team->t.t_id, tid, bt)); |
568 | KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); |
569 | |
570 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
571 | // Barrier imbalance - save arrive time to the thread |
572 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
573 | this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = |
574 | __itt_get_timestamp(); |
575 | } |
576 | #endif |
577 | // We now perform a linear reduction to signal that all of the threads have |
578 | // arrived. |
579 | if (!KMP_MASTER_TID(tid)) { |
580 | KA_TRACE(20, |
581 | ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" |
              "arrived(%p): %llu => %llu\n",
583 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team), |
584 | team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived, |
585 | thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); |
586 | // Mark arrival to primary thread |
587 | /* After performing this write, a worker thread may not assume that the team |
588 | is valid any more - it could be deallocated by the primary thread at any |
589 | time. */ |
590 | kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); |
591 | flag.release(); |
592 | } else { |
593 | kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; |
594 | int nproc = this_thr->th.th_team_nproc; |
595 | int i; |
596 | // Don't have to worry about sleep bit here or atomic since team setting |
597 | kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; |
598 | |
599 | // Collect all the worker team member threads. |
600 | for (i = 1; i < nproc; ++i) { |
601 | #if KMP_CACHE_MANAGE |
602 | // Prefetch next thread's arrived count |
603 | if (i + 1 < nproc) |
604 | KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived); |
605 | #endif /* KMP_CACHE_MANAGE */ |
606 | KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " |
                    "arrived(%p) == %llu\n",
608 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), |
609 | team->t.t_id, i, |
610 | &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); |
611 | |
612 | // Wait for worker thread to arrive |
613 | if (cancellable) { |
614 | kmp_flag_64<true, false> flag( |
615 | &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); |
616 | if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) |
617 | return true; |
618 | } else { |
619 | kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, |
620 | new_state); |
621 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
622 | } |
623 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
624 | // Barrier imbalance - write min of the thread time and the other thread |
625 | // time to the thread. |
626 | if (__kmp_forkjoin_frames_mode == 2) { |
627 | this_thr->th.th_bar_min_time = KMP_MIN( |
628 | this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); |
629 | } |
630 | #endif |
631 | if (reduce) { |
632 | KA_TRACE(100, |
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
634 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), |
635 | team->t.t_id, i)); |
636 | OMPT_REDUCTION_DECL(this_thr, gtid); |
637 | OMPT_REDUCTION_BEGIN; |
638 | (*reduce)(this_thr->th.th_local.reduce_data, |
639 | other_threads[i]->th.th_local.reduce_data); |
640 | OMPT_REDUCTION_END; |
641 | } |
642 | } |
643 | // Don't have to worry about sleep bit here or atomic since team setting |
644 | team_bar->b_arrived = new_state; |
645 | KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " |
                  "arrived(%p) = %llu\n",
647 | gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, |
648 | new_state)); |
649 | } |
650 | KA_TRACE( |
651 | 20, |
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
653 | gtid, team->t.t_id, tid, bt)); |
654 | return false; |
655 | } |
656 | |
657 | template <bool cancellable = false> |
658 | static bool __kmp_linear_barrier_release_template( |
659 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
660 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
661 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); |
662 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
663 | kmp_team_t *team; |
664 | |
665 | if (KMP_MASTER_TID(tid)) { |
666 | unsigned int i; |
667 | kmp_uint32 nproc = this_thr->th.th_team_nproc; |
668 | kmp_info_t **other_threads; |
669 | |
670 | team = __kmp_threads[gtid]->th.th_team; |
671 | KMP_DEBUG_ASSERT(team != NULL); |
672 | other_threads = team->t.t_threads; |
673 | |
674 | KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for " |
                  "barrier type %d\n",
676 | gtid, team->t.t_id, tid, bt)); |
677 | |
678 | if (nproc > 1) { |
679 | #if KMP_BARRIER_ICV_PUSH |
680 | { |
681 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); |
682 | if (propagate_icvs) { |
683 | ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); |
684 | for (i = 1; i < nproc; ++i) { |
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
687 | ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, |
688 | &team->t.t_implicit_task_taskdata[0].td_icvs); |
689 | } |
690 | ngo_sync(); |
691 | } |
692 | } |
693 | #endif // KMP_BARRIER_ICV_PUSH |
694 | |
695 | // Now, release all of the worker threads |
696 | for (i = 1; i < nproc; ++i) { |
697 | #if KMP_CACHE_MANAGE |
698 | // Prefetch next thread's go flag |
699 | if (i + 1 < nproc) |
700 | KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go); |
701 | #endif /* KMP_CACHE_MANAGE */ |
702 | KA_TRACE( |
703 | 20, |
704 | ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " |
             "go(%p): %u => %u\n",
706 | gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid, |
707 | team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go, |
708 | other_threads[i]->th.th_bar[bt].bb.b_go, |
709 | other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); |
710 | kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go, |
711 | other_threads[i]); |
712 | flag.release(); |
713 | } |
714 | } |
715 | } else { // Wait for the PRIMARY thread to release us |
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
717 | gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); |
718 | if (cancellable) { |
719 | kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
720 | if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj))) |
721 | return true; |
722 | } else { |
723 | kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
724 | flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); |
725 | } |
726 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
727 | if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { |
728 | // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is |
729 | // disabled) |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
733 | |
734 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
735 | return false; |
736 | |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
738 | if (itt_sync_obj != NULL) |
739 | // Call prepare as early as possible for "new" barrier |
        __kmp_itt_task_finished(itt_sync_obj);
741 | } else |
742 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
743 | // Early exit for reaping threads releasing forkjoin barrier |
744 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
745 | return false; |
746 | // The worker thread may now assume that the team is valid. |
747 | #ifdef KMP_DEBUG |
748 | tid = __kmp_tid_from_gtid(gtid); |
749 | team = __kmp_threads[gtid]->th.th_team; |
750 | #endif |
751 | KMP_DEBUG_ASSERT(team != NULL); |
752 | TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); |
753 | KA_TRACE(20, |
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
755 | gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); |
756 | KMP_MB(); // Flush all pending memory write invalidates. |
757 | } |
758 | KA_TRACE( |
759 | 20, |
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
761 | gtid, team->t.t_id, tid, bt)); |
762 | return false; |
763 | } |
764 | |
765 | static void __kmp_linear_barrier_gather( |
766 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
767 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
768 | __kmp_linear_barrier_gather_template<false>( |
769 | bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
770 | } |
771 | |
772 | static bool __kmp_linear_barrier_gather_cancellable( |
773 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
774 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
775 | return __kmp_linear_barrier_gather_template<true>( |
776 | bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
777 | } |
778 | |
779 | static void __kmp_linear_barrier_release( |
780 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
781 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
782 | __kmp_linear_barrier_release_template<false>( |
783 | bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); |
784 | } |
785 | |
786 | static bool __kmp_linear_barrier_release_cancellable( |
787 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
788 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
789 | return __kmp_linear_barrier_release_template<true>( |
790 | bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); |
791 | } |
792 | |
793 | // Tree barrier |
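// Indexing used below: the children of tid are (tid << branch_bits) + 1
// through (tid << branch_bits) + branch_factor, and the parent of a worker is
// (tid - 1) >> branch_bits. For example, with branch_bits = 2 (branch_factor
// 4), thread 0 gathers threads 1-4, thread 1 gathers threads 5-8, and the
// parent of thread 7 is (7 - 1) >> 2 = 1.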
794 | static void __kmp_tree_barrier_gather( |
795 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
796 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
797 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); |
798 | kmp_team_t *team = this_thr->th.th_team; |
799 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
800 | kmp_info_t **other_threads = team->t.t_threads; |
801 | kmp_uint32 nproc = this_thr->th.th_team_nproc; |
802 | kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; |
803 | kmp_uint32 branch_factor = 1 << branch_bits; |
804 | kmp_uint32 child; |
805 | kmp_uint32 child_tid; |
806 | kmp_uint64 new_state = 0; |
807 | |
808 | KA_TRACE( |
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
810 | gtid, team->t.t_id, tid, bt)); |
811 | KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); |
812 | |
813 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
814 | // Barrier imbalance - save arrive time to the thread |
815 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
816 | this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = |
817 | __itt_get_timestamp(); |
818 | } |
819 | #endif |
820 | // Perform tree gather to wait until all threads have arrived; reduce any |
821 | // required data as we go |
822 | child_tid = (tid << branch_bits) + 1; |
823 | if (child_tid < nproc) { |
824 | // Parent threads wait for all their children to arrive |
825 | new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; |
826 | child = 1; |
827 | do { |
828 | kmp_info_t *child_thr = other_threads[child_tid]; |
829 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
830 | #if KMP_CACHE_MANAGE |
831 | // Prefetch next thread's arrived count |
832 | if (child + 1 <= branch_factor && child_tid + 1 < nproc) |
833 | KMP_CACHE_PREFETCH( |
834 | &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived); |
835 | #endif /* KMP_CACHE_MANAGE */ |
836 | KA_TRACE(20, |
837 | ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " |
                "arrived(%p) == %llu\n",
839 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
840 | team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); |
841 | // Wait for child to arrive |
842 | kmp_flag_64<> flag(&child_bar->b_arrived, new_state); |
843 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
844 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
845 | // Barrier imbalance - write min of the thread time and a child time to |
846 | // the thread. |
847 | if (__kmp_forkjoin_frames_mode == 2) { |
848 | this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, |
849 | child_thr->th.th_bar_min_time); |
850 | } |
851 | #endif |
852 | if (reduce) { |
853 | KA_TRACE(100, |
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
855 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
856 | team->t.t_id, child_tid)); |
857 | OMPT_REDUCTION_DECL(this_thr, gtid); |
858 | OMPT_REDUCTION_BEGIN; |
859 | (*reduce)(this_thr->th.th_local.reduce_data, |
860 | child_thr->th.th_local.reduce_data); |
861 | OMPT_REDUCTION_END; |
862 | } |
863 | child++; |
864 | child_tid++; |
865 | } while (child <= branch_factor && child_tid < nproc); |
866 | } |
867 | |
868 | if (!KMP_MASTER_TID(tid)) { // Worker threads |
869 | kmp_int32 parent_tid = (tid - 1) >> branch_bits; |
870 | |
871 | KA_TRACE(20, |
872 | ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " |
              "arrived(%p): %llu => %llu\n",
874 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), |
875 | team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, |
876 | thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); |
877 | |
878 | // Mark arrival to parent thread |
879 | /* After performing this write, a worker thread may not assume that the team |
880 | is valid any more - it could be deallocated by the primary thread at any |
881 | time. */ |
882 | kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); |
883 | flag.release(); |
884 | } else { |
885 | // Need to update the team arrived pointer if we are the primary thread |
886 | if (nproc > 1) // New value was already computed above |
887 | team->t.t_bar[bt].b_arrived = new_state; |
888 | else |
889 | team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; |
890 | KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " |
                  "arrived(%p) = %llu\n",
892 | gtid, team->t.t_id, tid, team->t.t_id, |
893 | &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); |
894 | } |
895 | KA_TRACE(20, |
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
897 | gtid, team->t.t_id, tid, bt)); |
898 | } |
899 | |
900 | static void __kmp_tree_barrier_release( |
901 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
902 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
903 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); |
904 | kmp_team_t *team; |
905 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
906 | kmp_uint32 nproc; |
907 | kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; |
908 | kmp_uint32 branch_factor = 1 << branch_bits; |
909 | kmp_uint32 child; |
910 | kmp_uint32 child_tid; |
911 | |
912 | // Perform a tree release for all of the threads that have been gathered |
913 | if (!KMP_MASTER_TID( |
914 | tid)) { // Handle fork barrier workers who aren't part of a team yet |
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
916 | &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); |
917 | // Wait for parent thread to release us |
918 | kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
919 | flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); |
920 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
921 | if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { |
922 | // In fork barrier where we could not get the object reliably (or |
923 | // ITTNOTIFY is disabled) |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
927 | |
928 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
929 | return; |
930 | |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
932 | if (itt_sync_obj != NULL) |
933 | // Call prepare as early as possible for "new" barrier |
        __kmp_itt_task_finished(itt_sync_obj);
935 | } else |
936 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
937 | // Early exit for reaping threads releasing forkjoin barrier |
938 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
939 | return; |
940 | |
941 | // The worker thread may now assume that the team is valid. |
942 | team = __kmp_threads[gtid]->th.th_team; |
943 | KMP_DEBUG_ASSERT(team != NULL); |
944 | tid = __kmp_tid_from_gtid(gtid); |
945 | |
946 | TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); |
947 | KA_TRACE(20, |
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
949 | team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); |
950 | KMP_MB(); // Flush all pending memory write invalidates. |
951 | } else { |
952 | team = __kmp_threads[gtid]->th.th_team; |
953 | KMP_DEBUG_ASSERT(team != NULL); |
954 | KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for " |
                  "barrier type %d\n",
956 | gtid, team->t.t_id, tid, bt)); |
957 | } |
958 | nproc = this_thr->th.th_team_nproc; |
959 | child_tid = (tid << branch_bits) + 1; |
960 | |
961 | if (child_tid < nproc) { |
962 | kmp_info_t **other_threads = team->t.t_threads; |
963 | child = 1; |
964 | // Parent threads release all their children |
965 | do { |
966 | kmp_info_t *child_thr = other_threads[child_tid]; |
967 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
968 | #if KMP_CACHE_MANAGE |
969 | // Prefetch next thread's go count |
970 | if (child + 1 <= branch_factor && child_tid + 1 < nproc) |
971 | KMP_CACHE_PREFETCH( |
972 | &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go); |
973 | #endif /* KMP_CACHE_MANAGE */ |
974 | |
975 | #if KMP_BARRIER_ICV_PUSH |
976 | { |
977 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); |
978 | if (propagate_icvs) { |
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
984 | } |
985 | } |
986 | #endif // KMP_BARRIER_ICV_PUSH |
987 | KA_TRACE(20, |
988 | ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" |
                "go(%p): %u => %u\n",
990 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
991 | team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, |
992 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
993 | // Release child from barrier |
994 | kmp_flag_64<> flag(&child_bar->b_go, child_thr); |
995 | flag.release(); |
996 | child++; |
997 | child_tid++; |
998 | } while (child <= branch_factor && child_tid < nproc); |
999 | } |
1000 | KA_TRACE( |
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1002 | gtid, team->t.t_id, tid, bt)); |
1003 | } |
1004 | |
1005 | // Hyper Barrier |
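// Sketch of the hypercube pairing used below (illustrative values: with
// branch_bits = 1, i.e. branch_factor = 2, and 8 threads): at level 0 the odd
// tids report to tid & ~1 (1->0, 3->2, 5->4, 7->6), at level 1 tids 2 and 6
// report to 0 and 4, and at level 2 tid 4 reports to 0. The release phase
// walks the same pairings, in reverse order when KMP_REVERSE_HYPER_BAR is
// defined.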
1006 | static void __kmp_hyper_barrier_gather( |
1007 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
1008 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
1009 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); |
1010 | kmp_team_t *team = this_thr->th.th_team; |
1011 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
1012 | kmp_info_t **other_threads = team->t.t_threads; |
1013 | kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; |
1014 | kmp_uint32 num_threads = this_thr->th.th_team_nproc; |
1015 | kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; |
1016 | kmp_uint32 branch_factor = 1 << branch_bits; |
1017 | kmp_uint32 offset; |
1018 | kmp_uint32 level; |
1019 | |
1020 | KA_TRACE( |
1021 | 20, |
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1023 | gtid, team->t.t_id, tid, bt)); |
1024 | KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); |
1025 | |
1026 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1027 | // Barrier imbalance - save arrive time to the thread |
1028 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
1029 | this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = |
1030 | __itt_get_timestamp(); |
1031 | } |
1032 | #endif |
1033 | /* Perform a hypercube-embedded tree gather to wait until all of the threads |
1034 | have arrived, and reduce any required data as we go. */ |
1035 | kmp_flag_64<> p_flag(&thr_bar->b_arrived); |
1036 | for (level = 0, offset = 1; offset < num_threads; |
1037 | level += branch_bits, offset <<= branch_bits) { |
1038 | kmp_uint32 child; |
1039 | kmp_uint32 child_tid; |
1040 | |
1041 | if (((tid >> level) & (branch_factor - 1)) != 0) { |
1042 | kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); |
1043 | |
1044 | KMP_MB(); // Synchronize parent and child threads. |
1045 | KA_TRACE(20, |
1046 | ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " |
                "arrived(%p): %llu => %llu\n",
1048 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), |
1049 | team->t.t_id, parent_tid, &thr_bar->b_arrived, |
1050 | thr_bar->b_arrived, |
1051 | thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); |
1052 | // Mark arrival to parent thread |
1053 | /* After performing this write (in the last iteration of the enclosing for |
1054 | loop), a worker thread may not assume that the team is valid any more |
1055 | - it could be deallocated by the primary thread at any time. */ |
1056 | p_flag.set_waiter(other_threads[parent_tid]); |
1057 | p_flag.release(); |
1058 | break; |
1059 | } |
1060 | |
1061 | // Parent threads wait for children to arrive |
1062 | if (new_state == KMP_BARRIER_UNUSED_STATE) |
1063 | new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; |
1064 | for (child = 1, child_tid = tid + (1 << level); |
1065 | child < branch_factor && child_tid < num_threads; |
1066 | child++, child_tid += (1 << level)) { |
1067 | kmp_info_t *child_thr = other_threads[child_tid]; |
1068 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1069 | #if KMP_CACHE_MANAGE |
1070 | kmp_uint32 next_child_tid = child_tid + (1 << level); |
1071 | // Prefetch next thread's arrived count |
1072 | if (child + 1 < branch_factor && next_child_tid < num_threads) |
1073 | KMP_CACHE_PREFETCH( |
1074 | &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); |
1075 | #endif /* KMP_CACHE_MANAGE */ |
1076 | KA_TRACE(20, |
1077 | ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " |
                "arrived(%p) == %llu\n",
1079 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
1080 | team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); |
1081 | // Wait for child to arrive |
1082 | kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); |
1083 | c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1084 | KMP_MB(); // Synchronize parent and child threads. |
1085 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1086 | // Barrier imbalance - write min of the thread time and a child time to |
1087 | // the thread. |
1088 | if (__kmp_forkjoin_frames_mode == 2) { |
1089 | this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, |
1090 | child_thr->th.th_bar_min_time); |
1091 | } |
1092 | #endif |
1093 | if (reduce) { |
1094 | KA_TRACE(100, |
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1096 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
1097 | team->t.t_id, child_tid)); |
1098 | OMPT_REDUCTION_DECL(this_thr, gtid); |
1099 | OMPT_REDUCTION_BEGIN; |
1100 | (*reduce)(this_thr->th.th_local.reduce_data, |
1101 | child_thr->th.th_local.reduce_data); |
1102 | OMPT_REDUCTION_END; |
1103 | } |
1104 | } |
1105 | } |
1106 | |
1107 | if (KMP_MASTER_TID(tid)) { |
1108 | // Need to update the team arrived pointer if we are the primary thread |
1109 | if (new_state == KMP_BARRIER_UNUSED_STATE) |
1110 | team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; |
1111 | else |
1112 | team->t.t_bar[bt].b_arrived = new_state; |
1113 | KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " |
                  "arrived(%p) = %llu\n",
1115 | gtid, team->t.t_id, tid, team->t.t_id, |
1116 | &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); |
1117 | } |
1118 | KA_TRACE( |
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1120 | gtid, team->t.t_id, tid, bt)); |
1121 | } |
1122 | |
1123 | // The reverse versions seem to beat the forward versions overall |
1124 | #define KMP_REVERSE_HYPER_BAR |
1125 | static void __kmp_hyper_barrier_release( |
1126 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
1127 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
1128 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); |
1129 | kmp_team_t *team; |
1130 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
1131 | kmp_info_t **other_threads; |
1132 | kmp_uint32 num_threads; |
1133 | kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; |
1134 | kmp_uint32 branch_factor = 1 << branch_bits; |
1135 | kmp_uint32 child; |
1136 | kmp_uint32 child_tid; |
1137 | kmp_uint32 offset; |
1138 | kmp_uint32 level; |
1139 | |
1140 | /* Perform a hypercube-embedded tree release for all of the threads that have |
1141 | been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads |
1142 | are released in the reverse order of the corresponding gather, otherwise |
1143 | threads are released in the same order. */ |
1144 | if (KMP_MASTER_TID(tid)) { // primary thread |
1145 | team = __kmp_threads[gtid]->th.th_team; |
1146 | KMP_DEBUG_ASSERT(team != NULL); |
1147 | KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for " |
                  "barrier type %d\n",
1149 | gtid, team->t.t_id, tid, bt)); |
1150 | #if KMP_BARRIER_ICV_PUSH |
1151 | if (propagate_icvs) { // primary already has ICVs in final destination; copy |
    copy_icvs(&thr_bar->th_fixed_icvs,
              &team->t.t_implicit_task_taskdata[tid].td_icvs);
1154 | } |
1155 | #endif |
1156 | } else { // Handle fork barrier workers who aren't part of a team yet |
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1158 | &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); |
1159 | // Wait for parent thread to release us |
1160 | kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
1161 | flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1162 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1163 | if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { |
1164 | // In fork barrier where we could not get the object reliably |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
1168 | |
1169 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
1170 | return; |
1171 | |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1173 | if (itt_sync_obj != NULL) |
1174 | // Call prepare as early as possible for "new" barrier |
        __kmp_itt_task_finished(itt_sync_obj);
1176 | } else |
1177 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
1178 | // Early exit for reaping threads releasing forkjoin barrier |
1179 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
1180 | return; |
1181 | |
1182 | // The worker thread may now assume that the team is valid. |
1183 | team = __kmp_threads[gtid]->th.th_team; |
1184 | KMP_DEBUG_ASSERT(team != NULL); |
1185 | tid = __kmp_tid_from_gtid(gtid); |
1186 | |
1187 | TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); |
1188 | KA_TRACE(20, |
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1190 | gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); |
1191 | KMP_MB(); // Flush all pending memory write invalidates. |
1192 | } |
1193 | num_threads = this_thr->th.th_team_nproc; |
1194 | other_threads = team->t.t_threads; |
1195 | |
1196 | #ifdef KMP_REVERSE_HYPER_BAR |
1197 | // Count up to correct level for parent |
1198 | for (level = 0, offset = 1; |
1199 | offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); |
1200 | level += branch_bits, offset <<= branch_bits) |
1201 | ; |
1202 | |
1203 | // Now go down from there |
1204 | for (level -= branch_bits, offset >>= branch_bits; offset != 0; |
1205 | level -= branch_bits, offset >>= branch_bits) |
1206 | #else |
1207 | // Go down the tree, level by level |
1208 | for (level = 0, offset = 1; offset < num_threads; |
1209 | level += branch_bits, offset <<= branch_bits) |
1210 | #endif // KMP_REVERSE_HYPER_BAR |
1211 | { |
1212 | #ifdef KMP_REVERSE_HYPER_BAR |
1213 | /* Now go in reverse order through the children, highest to lowest. |
1214 | Initial setting of child is conservative here. */ |
1215 | child = num_threads >> ((level == 0) ? level : level - 1); |
1216 | for (child = (child < branch_factor - 1) ? child : branch_factor - 1, |
1217 | child_tid = tid + (child << level); |
1218 | child >= 1; child--, child_tid -= (1 << level)) |
1219 | #else |
1220 | if (((tid >> level) & (branch_factor - 1)) != 0) |
1221 | // No need to go lower than this, since this is the level parent would be |
1222 | // notified |
1223 | break; |
1224 | // Iterate through children on this level of the tree |
1225 | for (child = 1, child_tid = tid + (1 << level); |
1226 | child < branch_factor && child_tid < num_threads; |
1227 | child++, child_tid += (1 << level)) |
1228 | #endif // KMP_REVERSE_HYPER_BAR |
1229 | { |
1230 | if (child_tid >= num_threads) |
1231 | continue; // Child doesn't exist so keep going |
1232 | else { |
1233 | kmp_info_t *child_thr = other_threads[child_tid]; |
1234 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1235 | #if KMP_CACHE_MANAGE |
1236 | kmp_uint32 next_child_tid = child_tid - (1 << level); |
1237 | // Prefetch next thread's go count |
1238 | #ifdef KMP_REVERSE_HYPER_BAR |
1239 | if (child - 1 >= 1 && next_child_tid < num_threads) |
1240 | #else |
1241 | if (child + 1 < branch_factor && next_child_tid < num_threads) |
1242 | #endif // KMP_REVERSE_HYPER_BAR |
1243 | KMP_CACHE_PREFETCH( |
1244 | &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); |
1245 | #endif /* KMP_CACHE_MANAGE */ |
1246 | |
1247 | #if KMP_BARRIER_ICV_PUSH |
1248 | if (propagate_icvs) // push my fixed ICVs to my child |
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1250 | #endif // KMP_BARRIER_ICV_PUSH |
1251 | |
1252 | KA_TRACE( |
1253 | 20, |
1254 | ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" |
             "go(%p): %u => %u\n",
1256 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
1257 | team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, |
1258 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
1259 | // Release child from barrier |
1260 | kmp_flag_64<> flag(&child_bar->b_go, child_thr); |
1261 | flag.release(); |
1262 | } |
1263 | } |
1264 | } |
1265 | #if KMP_BARRIER_ICV_PUSH |
1266 | if (propagate_icvs && |
1267 | !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest |
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
1272 | } |
1273 | #endif |
1274 | KA_TRACE( |
1275 | 20, |
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1277 | gtid, team->t.t_id, tid, bt)); |
1278 | } |
1279 | |
1280 | // Hierarchical Barrier |
1281 | |
1282 | // Initialize thread barrier data |
1283 | /* Initializes/re-initializes the hierarchical barrier data stored on a thread. |
1284 | Performs the minimum amount of initialization required based on how the team |
1285 | has changed. Returns true if leaf children will require both on-core and |
1286 | traditional wake-up mechanisms. For example, if the team size increases, |
1287 | threads already in the team will respond to on-core wakeup on their parent |
1288 | thread, but threads newly added to the team will only be listening on the |
1289 | their local b_go. */ |
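/* Illustrative example of the parent computation below (skip_per_level is
   filled in by __kmp_get_hierarchy and depends on the machine; the values
   here are only for illustration): with depth = 3 and skip_per_level =
   {1, 8, 64}, thread 13 finds 13 % 8 = 5 != 0 at d = 0, so parent_tid =
   13 - 5 = 8 and my_level = 0, while thread 8 falls through to
   d == depth - 2 and reports directly to the primary thread with
   my_level = 1. */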
1290 | static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, |
1291 | kmp_bstate_t *thr_bar, |
1292 | kmp_uint32 nproc, int gtid, |
1293 | int tid, kmp_team_t *team) { |
1294 | // Checks to determine if (re-)initialization is needed |
1295 | bool uninitialized = thr_bar->team == NULL; |
1296 | bool team_changed = team != thr_bar->team; |
1297 | bool team_sz_changed = nproc != thr_bar->nproc; |
1298 | bool tid_changed = tid != thr_bar->old_tid; |
1299 | bool retval = false; |
1300 | |
1301 | if (uninitialized || team_sz_changed) { |
1302 | __kmp_get_hierarchy(nproc, thr_bar); |
1303 | } |
1304 | |
1305 | if (uninitialized || team_sz_changed || tid_changed) { |
1306 | thr_bar->my_level = thr_bar->depth - 1; // default for primary thread |
1307 | thr_bar->parent_tid = -1; // default for primary thread |
1308 | if (!KMP_MASTER_TID(tid)) { |
1309 | // if not primary thread, find parent thread in hierarchy |
1310 | kmp_uint32 d = 0; |
1311 | while (d < thr_bar->depth) { // find parent based on level of thread in |
1312 | // hierarchy, and note level |
1313 | kmp_uint32 rem; |
1314 | if (d == thr_bar->depth - 2) { // reached level right below the primary |
1315 | thr_bar->parent_tid = 0; |
1316 | thr_bar->my_level = d; |
1317 | break; |
1318 | } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) { |
1319 | // TODO: can we make the above op faster? |
1320 | // thread is not a subtree root at next level, so this is max |
1321 | thr_bar->parent_tid = tid - rem; |
1322 | thr_bar->my_level = d; |
1323 | break; |
1324 | } |
1325 | ++d; |
1326 | } |
1327 | } |
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
1331 | thr_bar->old_tid = tid; |
1332 | thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; |
1333 | thr_bar->team = team; |
1334 | thr_bar->parent_bar = |
1335 | &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; |
1336 | } |
1337 | if (uninitialized || team_changed || tid_changed) { |
1338 | thr_bar->team = team; |
1339 | thr_bar->parent_bar = |
1340 | &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; |
1341 | retval = true; |
1342 | } |
1343 | if (uninitialized || team_sz_changed || tid_changed) { |
1344 | thr_bar->nproc = nproc; |
1345 | thr_bar->leaf_kids = thr_bar->base_leaf_kids; |
1346 | if (thr_bar->my_level == 0) |
1347 | thr_bar->leaf_kids = 0; |
1348 | if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) |
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1350 | thr_bar->leaf_state = 0; |
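    // Each leaf child is assigned one byte of the 64-bit leaf_state word: the
    // loop below sets byte (7 - i) for the i-th leaf, which is the same
    // (offset + 1) byte that leaf writes into this thread's b_arrived via
    // kmp_flag_oncore when it checks in.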
1351 | for (int i = 0; i < thr_bar->leaf_kids; ++i) |
1352 | ((char *)&(thr_bar->leaf_state))[7 - i] = 1; |
1353 | } |
1354 | return retval; |
1355 | } |
1356 | |
1357 | static void __kmp_hierarchical_barrier_gather( |
1358 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
1359 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
1360 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); |
1361 | kmp_team_t *team = this_thr->th.th_team; |
1362 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
1363 | kmp_uint32 nproc = this_thr->th.th_team_nproc; |
1364 | kmp_info_t **other_threads = team->t.t_threads; |
1365 | kmp_uint64 new_state = 0; |
1366 | |
1367 | int level = team->t.t_level; |
1368 | if (other_threads[0] |
1369 | ->th.th_teams_microtask) // are we inside the teams construct? |
1370 | if (this_thr->th.th_teams_size.nteams > 1) |
1371 | ++level; // level was not increased in teams construct for team_of_masters |
1372 | if (level == 1) |
1373 | thr_bar->use_oncore_barrier = 1; |
1374 | else |
1375 | thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested |
1376 | |
1377 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " |
1378 | "barrier type %d\n" , |
1379 | gtid, team->t.t_id, tid, bt)); |
1380 | KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); |
1381 | |
1382 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1383 | // Barrier imbalance - save arrive time to the thread |
1384 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
1385 | this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); |
1386 | } |
1387 | #endif |
1388 | |
1389 | (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, |
1390 | team); |
1391 | |
1392 | if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) |
1393 | kmp_int32 child_tid; |
1394 | new_state = |
1395 | (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; |
1396 | if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && |
1397 | thr_bar->use_oncore_barrier) { |
1398 | if (thr_bar->leaf_kids) { |
1399 | // First, wait for leaf children to check-in on my b_arrived flag |
1400 | kmp_uint64 leaf_state = |
1401 | KMP_MASTER_TID(tid) |
1402 | ? thr_bar->b_arrived | thr_bar->leaf_state |
1403 | : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; |
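        // The expected value ORs the per-leaf bytes into the current arrived
        // value, so the wait below only returns once every leaf child has
        // written its byte of this thread's b_arrived.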
1404 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " |
1405 | "for leaf kids\n" , |
1406 | gtid, team->t.t_id, tid)); |
1407 | kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); |
1408 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1409 | if (reduce) { |
1410 | OMPT_REDUCTION_DECL(this_thr, gtid); |
1411 | OMPT_REDUCTION_BEGIN; |
1412 | for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; |
1413 | ++child_tid) { |
1414 | KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " |
1415 | "T#%d(%d:%d)\n" , |
1416 | gtid, team->t.t_id, tid, |
1417 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1418 | child_tid)); |
1419 | (*reduce)(this_thr->th.th_local.reduce_data, |
1420 | other_threads[child_tid]->th.th_local.reduce_data); |
1421 | } |
1422 | OMPT_REDUCTION_END; |
1423 | } |
1424 | // clear leaf_state bits |
1425 | KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)); |
1426 | } |
1427 | // Next, wait for higher level children on each child's b_arrived flag |
1428 | for (kmp_uint32 d = 1; d < thr_bar->my_level; |
1429 | ++d) { // gather lowest level threads first, but skip 0 |
1430 | kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], |
1431 | skip = thr_bar->skip_per_level[d]; |
1432 | if (last > nproc) |
1433 | last = nproc; |
1434 | for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { |
1435 | kmp_info_t *child_thr = other_threads[child_tid]; |
1436 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1437 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " |
1438 | "T#%d(%d:%d) " |
1439 | "arrived(%p) == %llu\n" , |
1440 | gtid, team->t.t_id, tid, |
1441 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1442 | child_tid, &child_bar->b_arrived, new_state)); |
1443 | kmp_flag_64<> flag(&child_bar->b_arrived, new_state); |
1444 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1445 | if (reduce) { |
1446 | KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " |
1447 | "T#%d(%d:%d)\n" , |
1448 | gtid, team->t.t_id, tid, |
1449 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1450 | child_tid)); |
1451 | (*reduce)(this_thr->th.th_local.reduce_data, |
1452 | child_thr->th.th_local.reduce_data); |
1453 | } |
1454 | } |
1455 | } |
1456 | } else { // Blocktime is not infinite |
1457 | for (kmp_uint32 d = 0; d < thr_bar->my_level; |
1458 | ++d) { // Gather lowest level threads first |
1459 | kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], |
1460 | skip = thr_bar->skip_per_level[d]; |
1461 | if (last > nproc) |
1462 | last = nproc; |
1463 | for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { |
1464 | kmp_info_t *child_thr = other_threads[child_tid]; |
1465 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1466 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " |
1467 | "T#%d(%d:%d) " |
1468 | "arrived(%p) == %llu\n" , |
1469 | gtid, team->t.t_id, tid, |
1470 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1471 | child_tid, &child_bar->b_arrived, new_state)); |
1472 | kmp_flag_64<> flag(&child_bar->b_arrived, new_state); |
1473 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1474 | if (reduce) { |
1475 | KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " |
1476 | "T#%d(%d:%d)\n" , |
1477 | gtid, team->t.t_id, tid, |
1478 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1479 | child_tid)); |
1480 | (*reduce)(this_thr->th.th_local.reduce_data, |
1481 | child_thr->th.th_local.reduce_data); |
1482 | } |
1483 | } |
1484 | } |
1485 | } |
1486 | } |
1487 | // All subordinates are gathered; now release parent if not primary thread |
1488 | |
1489 | if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy |
1490 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" |
1491 | " T#%d(%d:%d) arrived(%p): %llu => %llu\n" , |
1492 | gtid, team->t.t_id, tid, |
1493 | __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, |
1494 | thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, |
1495 | thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); |
1496 | /* Mark arrival to parent: After performing this write, a worker thread may |
1497 | not assume that the team is valid any more - it could be deallocated by |
1498 | the primary thread at any time. */ |
1499 | if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || |
1500 | !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived |
1501 | // flag; release it |
1502 | kmp_flag_64<> flag(&thr_bar->b_arrived, |
1503 | other_threads[thr_bar->parent_tid]); |
1504 | flag.release(); |
1505 | } else { |
1506 | // Leaf does special release on "offset" bits of parent's b_arrived flag |
1507 | thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; |
1508 | kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, |
1509 | thr_bar->offset + 1); |
1510 | flag.set_waiter(other_threads[thr_bar->parent_tid]); |
1511 | flag.release(); |
1512 | } |
1513 | } else { // Primary thread needs to update the team's b_arrived value |
1514 | team->t.t_bar[bt].b_arrived = new_state; |
1515 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " |
1516 | "arrived(%p) = %llu\n" , |
1517 | gtid, team->t.t_id, tid, team->t.t_id, |
1518 | &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); |
1519 | } |
1520 | // Is the team access below unsafe or just technically invalid? |
1521 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " |
1522 | "barrier type %d\n" , |
1523 | gtid, team->t.t_id, tid, bt)); |
1524 | } |
1525 | |
1526 | static void __kmp_hierarchical_barrier_release( |
1527 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
1528 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
1529 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); |
1530 | kmp_team_t *team; |
1531 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
1532 | kmp_uint32 nproc; |
1533 | bool team_change = false; // indicates on-core barrier shouldn't be used |
1534 | |
1535 | if (KMP_MASTER_TID(tid)) { |
1536 | team = __kmp_threads[gtid]->th.th_team; |
1537 | KMP_DEBUG_ASSERT(team != NULL); |
1538 | KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " |
1539 | "entered barrier type %d\n" , |
1540 | gtid, team->t.t_id, tid, bt)); |
1541 | } else { // Worker threads |
1542 | // Wait for parent thread to release me |
1543 | if (!thr_bar->use_oncore_barrier || |
1544 | __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || |
1545 | thr_bar->team == NULL) { |
1546 | // Use traditional method of waiting on my own b_go flag |
1547 | thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; |
1548 | kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
1549 | flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1550 | TCW_8(thr_bar->b_go, |
1551 | KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time |
1552 | } else { // Thread barrier data is initialized, this is a leaf, blocktime is |
1553 | // infinite, not nested |
1554 | // Wait on my "offset" bits on parent's b_go flag |
1555 | thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; |
1556 | kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, |
1557 | thr_bar->offset + 1, bt, |
1558 | this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); |
1559 | flag.wait(this_thr, TRUE); |
1560 | if (thr_bar->wait_flag == |
1561 | KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go |
1562 | TCW_8(thr_bar->b_go, |
1563 | KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time |
1564 | } else { // Reset my bits on parent's b_go flag |
1565 | (RCAST(volatile char *, |
1566 | &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; |
1567 | } |
1568 | } |
1569 | thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; |
1570 | // Early exit for reaping threads releasing forkjoin barrier |
1571 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
1572 | return; |
1573 | // The worker thread may now assume that the team is valid. |
1574 | team = __kmp_threads[gtid]->th.th_team; |
1575 | KMP_DEBUG_ASSERT(team != NULL); |
1576 | tid = __kmp_tid_from_gtid(gtid); |
1577 | |
1578 | KA_TRACE( |
1579 | 20, |
1580 | ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n" , |
1581 | gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); |
1582 | KMP_MB(); // Flush all pending memory write invalidates. |
1583 | } |
1584 | |
1585 | nproc = this_thr->th.th_team_nproc; |
1586 | int level = team->t.t_level; |
1587 | if (team->t.t_threads[0] |
1588 | ->th.th_teams_microtask) { // are we inside the teams construct? |
1589 | if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && |
1590 | this_thr->th.th_teams_level == level) |
1591 | ++level; // level was not increased in teams construct for team_of_workers |
1592 | if (this_thr->th.th_teams_size.nteams > 1) |
1593 | ++level; // level was not increased in teams construct for team_of_masters |
1594 | } |
1595 | if (level == 1) |
1596 | thr_bar->use_oncore_barrier = 1; |
1597 | else |
1598 | thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested |
1599 | |
1600 | // If the team size has increased, we still communicate with old leaves via |
1601 | // oncore barrier. |
1602 | unsigned short int old_leaf_kids = thr_bar->leaf_kids; |
1603 | kmp_uint64 old_leaf_state = thr_bar->leaf_state; |
1604 | team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, |
1605 | tid, team); |
1606 | // But if the entire team changes, we won't use oncore barrier at all |
1607 | if (team_change) |
1608 | old_leaf_kids = 0; |
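  // old_leaf_kids is kept so that leaves carried over from a smaller team,
  // which may still be polling their bytes of this thread's b_go, can be
  // woken via old_leaf_state in the release code below.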
1609 | |
1610 | #if KMP_BARRIER_ICV_PUSH |
1611 | if (propagate_icvs) { |
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                             tid, FALSE);
1614 | if (KMP_MASTER_TID( |
1615 | tid)) { // primary already has copy in final destination; copy |
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
1618 | } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && |
1619 | thr_bar->use_oncore_barrier) { // optimization for inf blocktime |
1620 | if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) |
1621 | // leaves (on-core children) pull parent's fixed ICVs directly to local |
1622 | // ICV store |
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
1625 | // non-leaves will get ICVs piggybacked with b_go via NGO store |
1626 | } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs |
1627 | if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can |
1628 | // access |
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1630 | else // leaves copy parent's fixed ICVs directly to local ICV store |
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
1633 | } |
1634 | } |
1635 | #endif // KMP_BARRIER_ICV_PUSH |
1636 | |
1637 | // Now, release my children |
1638 | if (thr_bar->my_level) { // not a leaf |
1639 | kmp_int32 child_tid; |
1640 | kmp_uint32 last; |
1641 | if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && |
1642 | thr_bar->use_oncore_barrier) { |
1643 | if (KMP_MASTER_TID(tid)) { // do a flat release |
1644 | // Set local b_go to bump children via NGO store of the cache line |
1645 | // containing IVCs and b_go. |
1646 | thr_bar->b_go = KMP_BARRIER_STATE_BUMP; |
1647 | // Use ngo stores if available; b_go piggybacks in the last 8 bytes of |
1648 | // the cache line |
1649 | ngo_load(&thr_bar->th_fixed_icvs); |
1650 | // This loops over all the threads skipping only the leaf nodes in the |
1651 | // hierarchy |
1652 | for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; |
1653 | child_tid += thr_bar->skip_per_level[1]) { |
1654 | kmp_bstate_t *child_bar = |
1655 | &team->t.t_threads[child_tid]->th.th_bar[bt].bb; |
1656 | KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " |
1657 | "releasing T#%d(%d:%d)" |
1658 | " go(%p): %u => %u\n" , |
1659 | gtid, team->t.t_id, tid, |
1660 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1661 | child_tid, &child_bar->b_go, child_bar->b_go, |
1662 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
1663 | // Use ngo store (if available) to both store ICVs and release child |
1664 | // via child's b_go |
1665 | ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); |
1666 | } |
1667 | ngo_sync(); |
1668 | } |
1669 | TCW_8(thr_bar->b_go, |
1670 | KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time |
1671 | // Now, release leaf children |
1672 | if (thr_bar->leaf_kids) { // if there are any |
1673 | // We test team_change on the off-chance that the level 1 team changed. |
1674 | if (team_change || |
1675 | old_leaf_kids < thr_bar->leaf_kids) { // some old, some new |
1676 | if (old_leaf_kids) { // release old leaf kids |
1677 | thr_bar->b_go |= old_leaf_state; |
1678 | } |
1679 | // Release new leaf kids |
1680 | last = tid + thr_bar->skip_per_level[1]; |
1681 | if (last > nproc) |
1682 | last = nproc; |
1683 | for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; |
1684 | ++child_tid) { // skip_per_level[0]=1 |
1685 | kmp_info_t *child_thr = team->t.t_threads[child_tid]; |
1686 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1687 | KA_TRACE( |
1688 | 20, |
1689 | ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" |
1690 | " T#%d(%d:%d) go(%p): %u => %u\n" , |
1691 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
1692 | team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, |
1693 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
1694 | // Release child using child's b_go flag |
1695 | kmp_flag_64<> flag(&child_bar->b_go, child_thr); |
1696 | flag.release(); |
1697 | } |
1698 | } else { // Release all children at once with leaf_state bits on my own |
1699 | // b_go flag |
1700 | thr_bar->b_go |= thr_bar->leaf_state; |
1701 | } |
1702 | } |
1703 | } else { // Blocktime is not infinite; do a simple hierarchical release |
1704 | for (int d = thr_bar->my_level - 1; d >= 0; |
1705 | --d) { // Release highest level threads first |
1706 | last = tid + thr_bar->skip_per_level[d + 1]; |
1707 | kmp_uint32 skip = thr_bar->skip_per_level[d]; |
1708 | if (last > nproc) |
1709 | last = nproc; |
1710 | for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { |
1711 | kmp_info_t *child_thr = team->t.t_threads[child_tid]; |
1712 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1713 | KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " |
1714 | "releasing T#%d(%d:%d) go(%p): %u => %u\n" , |
1715 | gtid, team->t.t_id, tid, |
1716 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1717 | child_tid, &child_bar->b_go, child_bar->b_go, |
1718 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
1719 | // Release child using child's b_go flag |
1720 | kmp_flag_64<> flag(&child_bar->b_go, child_thr); |
1721 | flag.release(); |
1722 | } |
1723 | } |
1724 | } |
1725 | #if KMP_BARRIER_ICV_PUSH |
1726 | if (propagate_icvs && !KMP_MASTER_TID(tid)) |
1727 | // non-leaves copy ICVs from fixed ICVs to local dest |
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
1730 | #endif // KMP_BARRIER_ICV_PUSH |
1731 | } |
1732 | KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " |
1733 | "barrier type %d\n" , |
1734 | gtid, team->t.t_id, tid, bt)); |
1735 | } |
1736 | |
1737 | // End of Barrier Algorithms |
1738 | |
1739 | // type traits for cancellable value |
1740 | // if cancellable is true, then is_cancellable is a normal boolean variable |
1741 | // if cancellable is false, then is_cancellable is a compile time constant |
1742 | template <bool cancellable> struct is_cancellable {}; |
1743 | template <> struct is_cancellable<true> { |
1744 | bool value; |
1745 | is_cancellable() : value(false) {} |
1746 | is_cancellable(bool b) : value(b) {} |
1747 | is_cancellable &operator=(bool b) { |
1748 | value = b; |
1749 | return *this; |
1750 | } |
1751 | operator bool() const { return value; } |
1752 | }; |
1753 | template <> struct is_cancellable<false> { |
1754 | is_cancellable &operator=(bool b) { return *this; } |
1755 | constexpr operator bool() const { return false; } |
1756 | }; |
1757 | |
1758 | // Internal function to do a barrier. |
/* If is_split is true, do a split barrier, otherwise do a plain barrier.
   If reduce is non-NULL, do a split reduction barrier, otherwise do a
   barrier without a reduction.
1762 | When cancellable = false, |
1763 | Returns 0 if primary thread, 1 if worker thread. |
1764 | When cancellable = true |
1765 | Returns 0 if not cancelled, 1 if cancelled. */ |
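// For example, __kmp_barrier() below forwards to the default non-cancellable
// instantiation, and __kmp_barrier_gomp_cancel() uses the cancellable one:
//   __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);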
1766 | template <bool cancellable = false> |
1767 | static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, |
1768 | size_t reduce_size, void *reduce_data, |
1769 | void (*reduce)(void *, void *)) { |
1770 | KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); |
1771 | KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); |
1772 | int tid = __kmp_tid_from_gtid(gtid); |
1773 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
1774 | kmp_team_t *team = this_thr->th.th_team; |
1775 | int status = 0; |
1776 | is_cancellable<cancellable> cancelled; |
1777 | #if OMPT_SUPPORT && OMPT_OPTIONAL |
1778 | ompt_data_t *my_task_data; |
1779 | ompt_data_t *my_parallel_data; |
1780 | void *return_address; |
1781 | ompt_sync_region_t barrier_kind; |
1782 | #endif |
1783 | |
1784 | KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n" , gtid, |
1785 | __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); |
1786 | |
1787 | #if OMPT_SUPPORT |
1788 | if (ompt_enabled.enabled) { |
1789 | #if OMPT_OPTIONAL |
1790 | my_task_data = OMPT_CUR_TASK_DATA(this_thr); |
1791 | my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); |
1792 | return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); |
1793 | barrier_kind = __ompt_get_barrier_kind(bt, this_thr); |
1794 | if (ompt_enabled.ompt_callback_sync_region) { |
1795 | ompt_callbacks.ompt_callback(ompt_callback_sync_region)( |
1796 | barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, |
1797 | return_address); |
1798 | } |
1799 | if (ompt_enabled.ompt_callback_sync_region_wait) { |
1800 | ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( |
1801 | barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, |
1802 | return_address); |
1803 | } |
1804 | #endif |
1805 | // It is OK to report the barrier state after the barrier begin callback. |
1806 | // According to the OMPT specification, a compliant implementation may |
1807 | // even delay reporting this state until the barrier begins to wait. |
1808 | this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; |
1809 | } |
1810 | #endif |
1811 | |
1812 | if (!team->t.t_serialized) { |
1813 | #if USE_ITT_BUILD |
1814 | // This value will be used in itt notify events below. |
1815 | void *itt_sync_obj = NULL; |
1816 | #if USE_ITT_NOTIFY |
1817 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1819 | #endif |
1820 | #endif /* USE_ITT_BUILD */ |
1821 | if (__kmp_tasking_mode == tskm_extra_barrier) { |
      __kmp_tasking_barrier(team, this_thr, gtid);
1823 | KA_TRACE(15, |
1824 | ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n" , gtid, |
1825 | __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); |
1826 | } |
1827 | |
1828 | /* Copy the blocktime info to the thread, where __kmp_wait_template() can |
1829 | access it when the team struct is not guaranteed to exist. */ |
1830 | // See note about the corresponding code in __kmp_join_barrier() being |
1831 | // performance-critical. |
1832 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
1833 | #if KMP_USE_MONITOR |
1834 | this_thr->th.th_team_bt_intervals = |
1835 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; |
1836 | this_thr->th.th_team_bt_set = |
1837 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; |
1838 | #else |
1839 | this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); |
1840 | #endif |
1841 | } |
1842 | |
1843 | #if USE_ITT_BUILD |
1844 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1846 | #endif /* USE_ITT_BUILD */ |
1847 | #if USE_DEBUGGER |
1848 | // Let the debugger know: the thread arrived to the barrier and waiting. |
1849 | if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct |
1850 | team->t.t_bar[bt].b_master_arrived += 1; |
1851 | } else { |
1852 | this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; |
1853 | } // if |
1854 | #endif /* USE_DEBUGGER */ |
1855 | if (reduce != NULL) { |
1856 | // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 |
1857 | this_thr->th.th_local.reduce_data = reduce_data; |
1858 | } |
1859 | |
1860 | if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) |
1861 | // use 0 to only setup the current team if nthreads > 1 |
      __kmp_task_team_setup(this_thr, team, 0);
1863 | |
1864 | if (cancellable) { |
1865 | cancelled = __kmp_linear_barrier_gather_cancellable( |
1866 | bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1867 | } else { |
1868 | switch (__kmp_barrier_gather_pattern[bt]) { |
1869 | case bp_dist_bar: { |
1870 | __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, |
1871 | reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1872 | break; |
1873 | } |
1874 | case bp_hyper_bar: { |
1875 | // don't set branch bits to 0; use linear |
1876 | KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); |
1877 | __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, |
1878 | reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1879 | break; |
1880 | } |
1881 | case bp_hierarchical_bar: { |
1882 | __kmp_hierarchical_barrier_gather( |
1883 | bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1884 | break; |
1885 | } |
1886 | case bp_tree_bar: { |
1887 | // don't set branch bits to 0; use linear |
1888 | KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); |
1889 | __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, |
1890 | reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1891 | break; |
1892 | } |
1893 | default: { |
1894 | __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, |
1895 | reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1896 | } |
1897 | } |
1898 | } |
1899 | |
1900 | KMP_MB(); |
1901 | |
1902 | if (KMP_MASTER_TID(tid)) { |
1903 | status = 0; |
1904 | if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { |
1905 | __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); |
1906 | } |
1907 | #if USE_DEBUGGER |
1908 | // Let the debugger know: All threads are arrived and starting leaving the |
1909 | // barrier. |
1910 | team->t.t_bar[bt].b_team_arrived += 1; |
1911 | #endif |
1912 | |
1913 | if (__kmp_omp_cancellation) { |
1914 | kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request); |
1915 | // Reset cancellation flag for worksharing constructs |
1916 | if (cancel_request == cancel_loop || |
1917 | cancel_request == cancel_sections) { |
1918 | KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq); |
1919 | } |
1920 | } |
1921 | #if USE_ITT_BUILD |
1922 | /* TODO: In case of split reduction barrier, primary thread may send |
1923 | acquired event early, before the final summation into the shared |
1924 | variable is done (final summation can be a long operation for array |
1925 | reductions). */ |
1926 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1928 | #endif /* USE_ITT_BUILD */ |
1929 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1930 | // Barrier - report frame end (only if active_level == 1) |
1931 | if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && |
1932 | __kmp_forkjoin_frames_mode && |
1933 | (this_thr->th.th_teams_microtask == NULL || // either not in teams |
1934 | this_thr->th.th_teams_size.nteams == 1) && // or inside single team |
1935 | team->t.t_active_level == 1) { |
1936 | ident_t *loc = __kmp_threads[gtid]->th.th_ident; |
1937 | kmp_uint64 cur_time = __itt_get_timestamp(); |
1938 | kmp_info_t **other_threads = team->t.t_threads; |
1939 | int nproc = this_thr->th.th_team_nproc; |
1940 | int i; |
1941 | switch (__kmp_forkjoin_frames_mode) { |
1942 | case 1: |
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
1945 | this_thr->th.th_frame_time = cur_time; |
1946 | break; |
1947 | case 2: // AC 2015-01-19: currently does not work for hierarchical (to |
1948 | // be fixed) |
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
1951 | break; |
1952 | case 3: |
1953 | if (__itt_metadata_add_ptr) { |
1954 | // Initialize with primary thread's wait time |
1955 | kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; |
1956 | // Set arrive time to zero to be able to check it in |
1957 | // __kmp_invoke_task(); the same is done inside the loop below |
1958 | this_thr->th.th_bar_arrive_time = 0; |
1959 | for (i = 1; i < nproc; ++i) { |
1960 | delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); |
1961 | other_threads[i]->th.th_bar_arrive_time = 0; |
1962 | } |
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
1969 | this_thr->th.th_frame_time = cur_time; |
1970 | break; |
1971 | } |
1972 | } |
1973 | #endif /* USE_ITT_BUILD */ |
1974 | } else { |
1975 | status = 1; |
1976 | #if USE_ITT_BUILD |
1977 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1979 | #endif /* USE_ITT_BUILD */ |
1980 | } |
1981 | if ((status == 1 || !is_split) && !cancelled) { |
1982 | if (cancellable) { |
1983 | cancelled = __kmp_linear_barrier_release_cancellable( |
1984 | bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1985 | } else { |
1986 | switch (__kmp_barrier_release_pattern[bt]) { |
1987 | case bp_dist_bar: { |
1988 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
1989 | __kmp_dist_barrier_release(bt, this_thr, gtid, tid, |
1990 | FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1991 | break; |
1992 | } |
1993 | case bp_hyper_bar: { |
1994 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
1995 | __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, |
1996 | FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1997 | break; |
1998 | } |
1999 | case bp_hierarchical_bar: { |
2000 | __kmp_hierarchical_barrier_release( |
2001 | bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2002 | break; |
2003 | } |
2004 | case bp_tree_bar: { |
2005 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
2006 | __kmp_tree_barrier_release(bt, this_thr, gtid, tid, |
2007 | FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2008 | break; |
2009 | } |
2010 | default: { |
2011 | __kmp_linear_barrier_release(bt, this_thr, gtid, tid, |
2012 | FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2013 | } |
2014 | } |
2015 | } |
2016 | if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { |
2017 | __kmp_task_team_sync(this_thr, team); |
2018 | } |
2019 | } |
2020 | |
2021 | #if USE_ITT_BUILD |
2022 | /* GEH: TODO: Move this under if-condition above and also include in |
2023 | __kmp_end_split_barrier(). This will more accurately represent the actual |
2024 | release time of the threads for split barriers. */ |
2025 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2027 | #endif /* USE_ITT_BUILD */ |
2028 | } else { // Team is serialized. |
2029 | status = 0; |
2030 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2031 | if (this_thr->th.th_task_team != NULL) { |
2032 | #if USE_ITT_NOTIFY |
2033 | void *itt_sync_obj = NULL; |
2034 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { |
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2037 | } |
2038 | #endif |
2039 | |
2040 | KMP_DEBUG_ASSERT( |
2041 | this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE || |
2042 | this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered == |
2043 | TRUE); |
2044 | __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); |
        __kmp_task_team_setup(this_thr, team, 0);
2046 | |
2047 | #if USE_ITT_BUILD |
2048 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2050 | #endif /* USE_ITT_BUILD */ |
2051 | } |
2052 | } |
2053 | } |
2054 | KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n" , |
2055 | gtid, __kmp_team_from_gtid(gtid)->t.t_id, |
2056 | __kmp_tid_from_gtid(gtid), status)); |
2057 | |
2058 | #if OMPT_SUPPORT |
2059 | if (ompt_enabled.enabled) { |
2060 | #if OMPT_OPTIONAL |
2061 | if (ompt_enabled.ompt_callback_sync_region_wait) { |
2062 | ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( |
2063 | barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, |
2064 | return_address); |
2065 | } |
2066 | if (ompt_enabled.ompt_callback_sync_region) { |
2067 | ompt_callbacks.ompt_callback(ompt_callback_sync_region)( |
2068 | barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, |
2069 | return_address); |
2070 | } |
2071 | #endif |
2072 | this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; |
2073 | } |
2074 | #endif |
2075 | |
2076 | if (cancellable) |
2077 | return (int)cancelled; |
2078 | return status; |
2079 | } |
2080 | |
2081 | // Returns 0 if primary thread, 1 if worker thread. |
2082 | int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, |
2083 | size_t reduce_size, void *reduce_data, |
2084 | void (*reduce)(void *, void *)) { |
2085 | return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data, |
2086 | reduce); |
2087 | } |
2088 | |
2089 | #if defined(KMP_GOMP_COMPAT) |
2090 | // Returns 1 if cancelled, 0 otherwise |
2091 | int __kmp_barrier_gomp_cancel(int gtid) { |
2092 | if (__kmp_omp_cancellation) { |
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
2095 | if (cancelled) { |
2096 | int tid = __kmp_tid_from_gtid(gtid); |
2097 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
2098 | if (KMP_MASTER_TID(tid)) { |
2099 | // Primary thread does not need to revert anything |
2100 | } else { |
2101 | // Workers need to revert their private b_arrived flag |
2102 | this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= |
2103 | KMP_BARRIER_STATE_BUMP; |
2104 | } |
2105 | } |
2106 | return cancelled; |
2107 | } |
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2109 | return FALSE; |
2110 | } |
2111 | #endif |
2112 | |
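// Completes a split barrier started by __kmp_barrier() with is_split=TRUE.
// The workers already entered the release phase there, so only the primary
// thread runs the release pattern here (e.g. after finishing the reduction)
// to let the waiting workers go.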
2113 | void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { |
2114 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); |
2115 | KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); |
2116 | KMP_DEBUG_ASSERT(bt < bs_last_barrier); |
2117 | int tid = __kmp_tid_from_gtid(gtid); |
2118 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
2119 | kmp_team_t *team = this_thr->th.th_team; |
2120 | |
2121 | if (!team->t.t_serialized) { |
2122 | if (KMP_MASTER_GTID(gtid)) { |
2123 | switch (__kmp_barrier_release_pattern[bt]) { |
2124 | case bp_dist_bar: { |
2125 | __kmp_dist_barrier_release(bt, this_thr, gtid, tid, |
2126 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2127 | break; |
2128 | } |
2129 | case bp_hyper_bar: { |
2130 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
2131 | __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, |
2132 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2133 | break; |
2134 | } |
2135 | case bp_hierarchical_bar: { |
2136 | __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, |
2137 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2138 | break; |
2139 | } |
2140 | case bp_tree_bar: { |
2141 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
2142 | __kmp_tree_barrier_release(bt, this_thr, gtid, tid, |
2143 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2144 | break; |
2145 | } |
2146 | default: { |
2147 | __kmp_linear_barrier_release(bt, this_thr, gtid, tid, |
2148 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2149 | } |
2150 | } |
2151 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2152 | __kmp_task_team_sync(this_thr, team); |
2153 | } // if |
2154 | } |
2155 | } |
2156 | } |
2157 | |
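// The join barrier runs only the gather pattern: workers are not released
// here; they are released in the next fork barrier (or reaped at shutdown).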
2158 | void __kmp_join_barrier(int gtid) { |
2159 | KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); |
2160 | KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); |
2161 | |
2162 | KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); |
2163 | |
2164 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
2165 | kmp_team_t *team; |
2166 | int tid; |
2167 | #ifdef KMP_DEBUG |
2168 | int team_id; |
2169 | #endif /* KMP_DEBUG */ |
2170 | #if USE_ITT_BUILD |
2171 | void *itt_sync_obj = NULL; |
2172 | #if USE_ITT_NOTIFY |
2173 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need |
2174 | // Get object created at fork_barrier |
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2176 | #endif |
2177 | #endif /* USE_ITT_BUILD */ |
2178 | #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG) |
2179 | int nproc = this_thr->th.th_team_nproc; |
2180 | #endif |
2181 | KMP_MB(); |
2182 | |
2183 | // Get current info |
2184 | team = this_thr->th.th_team; |
2185 | KMP_DEBUG_ASSERT(nproc == team->t.t_nproc); |
2186 | tid = __kmp_tid_from_gtid(gtid); |
2187 | #ifdef KMP_DEBUG |
2188 | team_id = team->t.t_id; |
2189 | kmp_info_t *master_thread = this_thr->th.th_team_master; |
2190 | if (master_thread != team->t.t_threads[0]) { |
2191 | __kmp_print_structure(); |
2192 | } |
2193 | #endif /* KMP_DEBUG */ |
2194 | KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); |
2195 | KMP_MB(); |
2196 | |
2197 | // Verify state |
2198 | KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); |
2199 | KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); |
2200 | KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); |
2201 | KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n" , |
2202 | gtid, team_id, tid)); |
2203 | |
2204 | #if OMPT_SUPPORT |
2205 | if (ompt_enabled.enabled) { |
2206 | #if OMPT_OPTIONAL |
2207 | ompt_data_t *my_task_data; |
2208 | ompt_data_t *my_parallel_data; |
2209 | void *codeptr = NULL; |
2210 | int ds_tid = this_thr->th.th_info.ds.ds_tid; |
2211 | if (KMP_MASTER_TID(ds_tid) && |
2212 | (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || |
2213 | ompt_callbacks.ompt_callback(ompt_callback_sync_region))) |
2214 | codeptr = team->t.ompt_team_info.master_return_address; |
2215 | my_task_data = OMPT_CUR_TASK_DATA(this_thr); |
2216 | my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); |
2217 | if (ompt_enabled.ompt_callback_sync_region) { |
2218 | ompt_callbacks.ompt_callback(ompt_callback_sync_region)( |
2219 | ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, |
2220 | my_task_data, codeptr); |
2221 | } |
2222 | if (ompt_enabled.ompt_callback_sync_region_wait) { |
2223 | ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( |
2224 | ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data, |
2225 | my_task_data, codeptr); |
2226 | } |
2227 | if (!KMP_MASTER_TID(ds_tid)) |
2228 | this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); |
2229 | #endif |
2230 | this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit; |
2231 | } |
2232 | #endif |
2233 | |
2234 | if (__kmp_tasking_mode == tskm_extra_barrier) { |
    __kmp_tasking_barrier(team, this_thr, gtid);
2236 | KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n" , |
2237 | gtid, team_id, tid)); |
2238 | } |
2239 | #ifdef KMP_DEBUG |
2240 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2241 | KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " |
2242 | "%p, th_task_team = %p\n" , |
2243 | __kmp_gtid_from_thread(this_thr), team_id, |
2244 | team->t.t_task_team[this_thr->th.th_task_state], |
2245 | this_thr->th.th_task_team)); |
2246 | if (this_thr->th.th_task_team) |
2247 | KMP_DEBUG_ASSERT(this_thr->th.th_task_team == |
2248 | team->t.t_task_team[this_thr->th.th_task_state]); |
2249 | } |
2250 | #endif /* KMP_DEBUG */ |
2251 | |
2252 | /* Copy the blocktime info to the thread, where __kmp_wait_template() can |
   access it when the team struct is not guaranteed to exist. Doing these loads
   causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
2255 | we do not perform the copy if blocktime=infinite, since the values are not |
2256 | used by __kmp_wait_template() in that case. */ |
2257 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
2258 | #if KMP_USE_MONITOR |
2259 | this_thr->th.th_team_bt_intervals = |
2260 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; |
2261 | this_thr->th.th_team_bt_set = |
2262 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; |
2263 | #else |
2264 | this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); |
2265 | #endif |
2266 | } |
2267 | |
2268 | #if USE_ITT_BUILD |
2269 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2271 | #endif /* USE_ITT_BUILD */ |
2272 | |
2273 | switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { |
2274 | case bp_dist_bar: { |
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2277 | break; |
2278 | } |
2279 | case bp_hyper_bar: { |
2280 | KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); |
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2283 | break; |
2284 | } |
2285 | case bp_hierarchical_bar: { |
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2288 | break; |
2289 | } |
2290 | case bp_tree_bar: { |
2291 | KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); |
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2294 | break; |
2295 | } |
2296 | default: { |
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2299 | } |
2300 | } |
2301 | |
2302 | /* From this point on, the team data structure may be deallocated at any time |
2303 | by the primary thread - it is unsafe to reference it in any of the worker |
2304 | threads. Any per-team data items that need to be referenced before the |
2305 | end of the barrier should be moved to the kmp_task_team_t structs. */ |
2306 | if (KMP_MASTER_TID(tid)) { |
2307 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2308 | __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); |
2309 | } |
2310 | if (__kmp_display_affinity) { |
2311 | KMP_CHECK_UPDATE(team->t.t_display_affinity, 0); |
2312 | } |
2313 | #if KMP_STATS_ENABLED |
    // Have the primary thread flag the workers to indicate they are now
    // waiting for the next parallel region. Also wake them up so they switch
    // their timers to idle.
2317 | for (int i = 0; i < team->t.t_nproc; ++i) { |
2318 | kmp_info_t *team_thread = team->t.t_threads[i]; |
2319 | if (team_thread == this_thr) |
2320 | continue; |
2321 | team_thread->th.th_stats->setIdleFlag(); |
2322 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && |
2323 | team_thread->th.th_sleep_loc != NULL) |
2324 | __kmp_null_resume_wrapper(team_thread); |
2325 | } |
2326 | #endif |
2327 | #if USE_ITT_BUILD |
2328 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2330 | #endif /* USE_ITT_BUILD */ |
2331 | |
2332 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
2333 | // Join barrier - report frame end |
2334 | if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && |
2335 | __kmp_forkjoin_frames_mode && |
2336 | (this_thr->th.th_teams_microtask == NULL || // either not in teams |
2337 | this_thr->th.th_teams_size.nteams == 1) && // or inside single team |
2338 | team->t.t_active_level == 1) { |
2339 | kmp_uint64 cur_time = __itt_get_timestamp(); |
2340 | ident_t *loc = team->t.t_ident; |
2341 | kmp_info_t **other_threads = team->t.t_threads; |
2342 | switch (__kmp_forkjoin_frames_mode) { |
2343 | case 1: |
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
2346 | break; |
2347 | case 2: |
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
2350 | break; |
2351 | case 3: |
2352 | if (__itt_metadata_add_ptr) { |
2353 | // Initialize with primary thread's wait time |
2354 | kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; |
2355 | // Set arrive time to zero to be able to check it in |
2356 | // __kmp_invoke_task(); the same is done inside the loop below |
2357 | this_thr->th.th_bar_arrive_time = 0; |
2358 | for (int i = 1; i < nproc; ++i) { |
2359 | delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); |
2360 | other_threads[i]->th.th_bar_arrive_time = 0; |
2361 | } |
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
2367 | this_thr->th.th_frame_time = cur_time; |
2368 | break; |
2369 | } |
2370 | } |
2371 | #endif /* USE_ITT_BUILD */ |
2372 | } |
2373 | #if USE_ITT_BUILD |
2374 | else { |
2375 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2377 | } |
2378 | #endif /* USE_ITT_BUILD */ |
2379 | |
2380 | #if KMP_DEBUG |
2381 | if (KMP_MASTER_TID(tid)) { |
2382 | KA_TRACE( |
2383 | 15, |
2384 | ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n" , |
2385 | gtid, team_id, tid, nproc)); |
2386 | } |
2387 | #endif /* KMP_DEBUG */ |
2388 | |
2389 | // TODO now, mark worker threads as done so they may be disbanded |
2390 | KMP_MB(); // Flush all pending memory write invalidates. |
2391 | KA_TRACE(10, |
2392 | ("__kmp_join_barrier: T#%d(%d:%d) leaving\n" , gtid, team_id, tid)); |
2393 | |
2394 | } |
2395 | |
2396 | // TODO release worker threads' fork barriers as we are ready instead of all at |
2397 | // once |
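// In the fork barrier the primary thread sets up the task team and releases
// the workers; each released worker then syncs its task-team pointer, pulls
// ICVs if KMP_BARRIER_ICV_PULL is enabled, and applies any pending affinity
// changes before running the next parallel region.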
2398 | void __kmp_fork_barrier(int gtid, int tid) { |
2399 | KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); |
2400 | KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); |
2401 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
2402 | kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; |
2403 | #if USE_ITT_BUILD |
2404 | void *itt_sync_obj = NULL; |
2405 | #endif /* USE_ITT_BUILD */ |
2406 | #ifdef KMP_DEBUG |
2407 | if (team) |
2408 | KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n" , gtid, |
2409 | (team != NULL) ? team->t.t_id : -1, tid)); |
2410 | #endif |
2411 | // th_team pointer only valid for primary thread here |
2412 | if (KMP_MASTER_TID(tid)) { |
2413 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
2414 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { |
2415 | // Create itt barrier object |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2418 | } |
2419 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
2420 | |
2421 | #ifdef KMP_DEBUG |
2422 | KMP_DEBUG_ASSERT(team); |
2423 | kmp_info_t **other_threads = team->t.t_threads; |
2424 | int i; |
2425 | |
2426 | // Verify state |
2427 | KMP_MB(); |
2428 | |
2429 | for (i = 1; i < team->t.t_nproc; ++i) { |
2430 | KA_TRACE(500, |
2431 | ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " |
2432 | "== %u.\n" , |
2433 | gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, |
2434 | team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, |
2435 | other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); |
2436 | KMP_DEBUG_ASSERT( |
2437 | (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & |
2438 | ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); |
2439 | KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); |
2440 | } |
2441 | #endif |
2442 | |
2443 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2444 | // 0 indicates setup current task team if nthreads > 1 |
      __kmp_task_team_setup(this_thr, team, 0);
2446 | } |
2447 | |
2448 | /* The primary thread may have changed its blocktime between join barrier |
2449 | and fork barrier. Copy the blocktime info to the thread, where |
2450 | __kmp_wait_template() can access it when the team struct is not |
2451 | guaranteed to exist. */ |
2452 | // See note about the corresponding code in __kmp_join_barrier() being |
2453 | // performance-critical |
2454 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
2455 | #if KMP_USE_MONITOR |
2456 | this_thr->th.th_team_bt_intervals = |
2457 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; |
2458 | this_thr->th.th_team_bt_set = |
2459 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; |
2460 | #else |
2461 | this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); |
2462 | #endif |
2463 | } |
2464 | } // primary thread |
2465 | |
2466 | switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { |
2467 | case bp_dist_bar: { |
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
2470 | break; |
2471 | } |
2472 | case bp_hyper_bar: { |
2473 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); |
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2476 | break; |
2477 | } |
2478 | case bp_hierarchical_bar: { |
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2481 | break; |
2482 | } |
2483 | case bp_tree_bar: { |
2484 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); |
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2487 | break; |
2488 | } |
2489 | default: { |
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2492 | } |
2493 | } |
2494 | |
2495 | #if OMPT_SUPPORT |
2496 | if (ompt_enabled.enabled && |
2497 | this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { |
2498 | int ds_tid = this_thr->th.th_info.ds.ds_tid; |
2499 | ompt_data_t *task_data = (team) |
2500 | ? OMPT_CUR_TASK_DATA(this_thr) |
2501 | : &(this_thr->th.ompt_thread_info.task_data); |
2502 | this_thr->th.ompt_thread_info.state = ompt_state_overhead; |
2503 | #if OMPT_OPTIONAL |
2504 | void *codeptr = NULL; |
2505 | if (KMP_MASTER_TID(ds_tid) && |
2506 | (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || |
2507 | ompt_callbacks.ompt_callback(ompt_callback_sync_region))) |
2508 | codeptr = team ? team->t.ompt_team_info.master_return_address : NULL; |
2509 | if (ompt_enabled.ompt_callback_sync_region_wait) { |
2510 | ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( |
2511 | ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, |
2512 | codeptr); |
2513 | } |
2514 | if (ompt_enabled.ompt_callback_sync_region) { |
2515 | ompt_callbacks.ompt_callback(ompt_callback_sync_region)( |
2516 | ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, |
2517 | codeptr); |
2518 | } |
2519 | #endif |
2520 | if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { |
2521 | ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
2522 | ompt_scope_end, NULL, task_data, 0, ds_tid, |
2523 | ompt_task_implicit); // TODO: Can this be ompt_task_initial? |
2524 | } |
2525 | } |
2526 | #endif |
2527 | |
2528 | // Early exit for reaping threads releasing forkjoin barrier |
2529 | if (TCR_4(__kmp_global.g.g_done)) { |
2530 | this_thr->th.th_task_team = NULL; |
2531 | |
2532 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
2533 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { |
2534 | if (!KMP_MASTER_TID(tid)) { |
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2538 | } |
2539 | } |
2540 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
2541 | KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n" , gtid)); |
2542 | return; |
2543 | } |
2544 | |
2545 | /* We can now assume that a valid team structure has been allocated by the |
2546 | primary thread and propagated to all worker threads. The current thread, |
2547 | however, may not be part of the team, so we can't blindly assume that the |
2548 | team pointer is non-null. */ |
2549 | team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); |
2550 | KMP_DEBUG_ASSERT(team != NULL); |
2551 | tid = __kmp_tid_from_gtid(gtid); |
2552 | |
2553 | #if KMP_BARRIER_ICV_PULL |
2554 | /* Primary thread's copy of the ICVs was set up on the implicit taskdata in |
2555 | __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's |
2556 | implicit task has this data before this function is called. We cannot |
2557 | modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's |
2558 | thread struct, because it is not always the case that the threads arrays |
2559 | have been allocated when __kmp_fork_call() is executed. */ |
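  // Under KMP_BARRIER_ICV_PULL the primary thread's ICVs are staged in its
  // forkjoin-barrier th_fixed_icvs by __kmp_setup_icv_copy() (see below), and
  // each worker copies them from there into its own implicit task here.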
2560 | { |
2561 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); |
2562 | if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs |
2563 | // Copy the initial ICVs from the primary thread's thread struct to the |
2564 | // implicit task for this tid. |
2565 | KA_TRACE(10, |
2566 | ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n" , gtid, tid)); |
2567 | __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, |
2568 | tid, FALSE); |
2569 | copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, |
2570 | &team->t.t_threads[0] |
2571 | ->th.th_bar[bs_forkjoin_barrier] |
2572 | .bb.th_fixed_icvs); |
2573 | } |
2574 | } |
2575 | #endif // KMP_BARRIER_ICV_PULL |
2576 | |
2577 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2578 | __kmp_task_team_sync(this_thr, team); |
2579 | } |
2580 | |
2581 | #if KMP_AFFINITY_SUPPORTED |
2582 | kmp_proc_bind_t proc_bind = team->t.t_proc_bind; |
2583 | if (proc_bind == proc_bind_intel) { |
2584 | // Call dynamic affinity settings |
2585 | if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) { |
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2587 | } |
2588 | } else if (proc_bind != proc_bind_false) { |
2589 | if (this_thr->th.th_new_place == this_thr->th.th_current_place) { |
2590 | KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n" , |
2591 | __kmp_gtid_from_thread(this_thr), |
2592 | this_thr->th.th_current_place)); |
2593 | } else { |
2594 | __kmp_affinity_bind_place(gtid); |
2595 | } |
2596 | } |
2597 | #endif // KMP_AFFINITY_SUPPORTED |
2598 | // Perform the display affinity functionality |
2599 | if (__kmp_display_affinity) { |
2600 | if (team->t.t_display_affinity |
2601 | #if KMP_AFFINITY_SUPPORTED |
2602 | || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) |
2603 | #endif |
2604 | ) { |
2605 | // NULL means use the affinity-format-var ICV |
2606 | __kmp_aux_display_affinity(gtid, NULL); |
2607 | this_thr->th.th_prev_num_threads = team->t.t_nproc; |
2608 | this_thr->th.th_prev_level = team->t.t_level; |
2609 | } |
2610 | } |
2611 | if (!KMP_MASTER_TID(tid)) |
2612 | KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator); |
2613 | |
2614 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
2615 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { |
2616 | if (!KMP_MASTER_TID(tid)) { |
2617 | // Get correct barrier object |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2620 | } // (prepare called inside barrier_release) |
2621 | } |
2622 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
2623 | KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n" , gtid, |
2624 | team->t.t_id, tid)); |
2625 | } |
2626 | |
2627 | void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, |
2628 | kmp_internal_control_t *new_icvs, ident_t *loc) { |
2629 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); |
2630 | |
2631 | KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); |
2632 | KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); |
2633 | |
2634 | /* Primary thread's copy of the ICVs was set up on the implicit taskdata in |
2635 | __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's |
2636 | implicit task has this data before this function is called. */ |
2637 | #if KMP_BARRIER_ICV_PULL |
2638 | /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which |
2639 | remains untouched), where all of the worker threads can access them and |
2640 | make their own copies after the barrier. */ |
2641 | KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be |
2642 | // allocated at this point |
2643 | copy_icvs( |
2644 | &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, |
2645 | new_icvs); |
2646 | KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n" , 0, |
2647 | team->t.t_threads[0], team)); |
2648 | #elif KMP_BARRIER_ICV_PUSH |
2649 | // The ICVs will be propagated in the fork barrier, so nothing needs to be |
2650 | // done here. |
2651 | KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n" , 0, |
2652 | team->t.t_threads[0], team)); |
2653 | #else |
2654 | // Copy the ICVs to each of the non-primary threads. This takes O(nthreads) |
2655 | // time. |
2656 | ngo_load(new_icvs); |
2657 | KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be |
2658 | // allocated at this point |
2659 | for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread |
2660 | // TODO: GEH - pass in better source location info since usually NULL here |
2661 | KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n" , |
2662 | f, team->t.t_threads[f], team)); |
2663 | __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); |
2664 | ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); |
2665 | KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n" , |
2666 | f, team->t.t_threads[f], team)); |
2667 | } |
2668 | ngo_sync(); |
2669 | #endif // KMP_BARRIER_ICV_PULL |
2670 | } |
2671 | |