1 | /* |
2 | * kmp_barrier.cpp |
3 | */ |
4 | |
5 | //===----------------------------------------------------------------------===// |
6 | // |
7 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
8 | // See https://llvm.org/LICENSE.txt for license information. |
9 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "kmp_wait_release.h" |
14 | #include "kmp_barrier.h" |
15 | #include "kmp_itt.h" |
16 | #include "kmp_os.h" |
17 | #include "kmp_stats.h" |
18 | #include "ompt-specific.h" |
19 | // for distributed barrier |
20 | #include "kmp_affinity.h" |
21 | |
22 | #if KMP_MIC |
23 | #include <immintrin.h> |
24 | #define USE_NGO_STORES 1 |
25 | #endif // KMP_MIC |
26 | |
27 | #if KMP_MIC && USE_NGO_STORES |
28 | // ICV copying |
29 | #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) |
30 | #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) |
31 | #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) |
32 | #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory") |
33 | #else |
34 | #define ngo_load(src) ((void)0) |
35 | #define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) |
36 | #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) |
37 | #define ngo_sync() ((void)0) |
38 | #endif /* KMP_MIC && USE_NGO_STORES */ |
39 | |
40 | void __kmp_print_structure(void); // Forward declaration |
41 | |
42 | // ---------------------------- Barrier Algorithms ---------------------------- |
43 | // Distributed barrier |
44 | |
45 | // Compute how many threads to have polling each cache-line. |
46 | // We want to limit the number of writes to IDEAL_GO_RESOLUTION. |
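// A worked example (illustrative only, assuming KMP_OPTIMIZE_FOR_REDUCTIONS is
// disabled; actual values depend on the detected topology): on a hypothetical
// machine with 2 sockets and 24 cores per socket, n == 96 gives
// threads_per_go = 24 >> 1 = 12, num_gos = 96 / 12 = 8, num_groups = 8 / 2 = 4,
// gos_per_group = 8 / 4 = 2 and threads_per_group = 12 * 2 = 24, i.e. each
// socket holds two groups and each group of 24 threads polls two "go" lines.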
47 | void distributedBarrier::computeVarsForN(size_t n) { |
48 | int nsockets = 1; |
49 | if (__kmp_topology) { |
    int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
    int core_level = __kmp_topology->get_level(KMP_HW_CORE);
    int ncores_per_socket =
        __kmp_topology->calculate_ratio(core_level, socket_level);
    nsockets = __kmp_topology->get_count(socket_level);
55 | |
56 | if (nsockets <= 0) |
57 | nsockets = 1; |
58 | if (ncores_per_socket <= 0) |
59 | ncores_per_socket = 1; |
60 | |
61 | threads_per_go = ncores_per_socket >> 1; |
62 | if (!fix_threads_per_go) { |
63 | // Minimize num_gos |
64 | if (threads_per_go > 4) { |
65 | if (KMP_OPTIMIZE_FOR_REDUCTIONS) { |
66 | threads_per_go = threads_per_go >> 1; |
67 | } |
68 | if (threads_per_go > 4 && nsockets == 1) |
69 | threads_per_go = threads_per_go >> 1; |
70 | } |
71 | } |
72 | if (threads_per_go == 0) |
73 | threads_per_go = 1; |
74 | fix_threads_per_go = true; |
75 | num_gos = n / threads_per_go; |
76 | if (n % threads_per_go) |
77 | num_gos++; |
78 | if (nsockets == 1 || num_gos == 1) |
79 | num_groups = 1; |
80 | else { |
81 | num_groups = num_gos / nsockets; |
82 | if (num_gos % nsockets) |
83 | num_groups++; |
84 | } |
85 | if (num_groups <= 0) |
86 | num_groups = 1; |
87 | gos_per_group = num_gos / num_groups; |
88 | if (num_gos % num_groups) |
89 | gos_per_group++; |
90 | threads_per_group = threads_per_go * gos_per_group; |
91 | } else { |
92 | num_gos = n / threads_per_go; |
93 | if (n % threads_per_go) |
94 | num_gos++; |
95 | if (num_gos == 1) |
96 | num_groups = 1; |
97 | else { |
98 | num_groups = num_gos / 2; |
99 | if (num_gos % 2) |
100 | num_groups++; |
101 | } |
102 | gos_per_group = num_gos / num_groups; |
103 | if (num_gos % num_groups) |
104 | gos_per_group++; |
105 | threads_per_group = threads_per_go * gos_per_group; |
106 | } |
107 | } |
108 | |
109 | void distributedBarrier::computeGo(size_t n) { |
110 | // Minimize num_gos |
111 | for (num_gos = 1;; num_gos++) |
112 | if (IDEAL_CONTENTION * num_gos >= n) |
113 | break; |
114 | threads_per_go = n / num_gos; |
115 | if (n % num_gos) |
116 | threads_per_go++; |
117 | while (num_gos > MAX_GOS) { |
118 | threads_per_go++; |
119 | num_gos = n / threads_per_go; |
120 | if (n % threads_per_go) |
121 | num_gos++; |
122 | } |
123 | computeVarsForN(n); |
124 | } |
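// Illustration (a sketch; the real values of IDEAL_CONTENTION and MAX_GOS are
// defined alongside the class): if, hypothetically, IDEAL_CONTENTION == 16 and
// MAX_GOS is large enough not to matter, then n == 100 stops the first loop at
// num_gos == 7 (16 * 7 >= 100) and threads_per_go == ceil(100 / 7) == 15;
// computeVarsForN() then refines the layout using the machine topology.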
125 | |
// This function resizes the barrier arrays when the new number of threads
// exceeds max_threads, the current size of all the arrays.
128 | void distributedBarrier::resize(size_t nthr) { |
129 | KMP_DEBUG_ASSERT(nthr > max_threads); |
130 | |
131 | // expand to requested size * 2 |
132 | max_threads = nthr * 2; |
133 | |
134 | // allocate arrays to new max threads |
135 | for (int i = 0; i < MAX_ITERS; ++i) { |
136 | if (flags[i]) |
137 | flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i], |
138 | max_threads * sizeof(flags_s)); |
139 | else |
140 | flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s)); |
141 | } |
142 | |
143 | if (go) |
144 | go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s)); |
145 | else |
146 | go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s)); |
147 | |
148 | if (iter) |
149 | iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s)); |
150 | else |
151 | iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s)); |
152 | |
153 | if (sleep) |
154 | sleep = |
155 | (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s)); |
156 | else |
157 | sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s)); |
158 | } |
159 | |
// This function sets all the go flags that threads might be waiting on; when
// blocktime is not infinite, it should be followed by a wake-up call to each
// thread.
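// Usage sketch (hypothetical caller; the real callers live in the runtime's
// team-management code), assuming a distributedBarrier *b plus a team and
// barrier type in scope:
//   b->go_release(); // let every potentially-waiting thread through
//   if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)
//     __kmp_dist_barrier_wakeup(bt, team, /*start=*/1, /*stop=*/b->num_threads,
//                               /*inc=*/1, /*tid=*/0);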
163 | kmp_uint64 distributedBarrier::go_release() { |
164 | kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS; |
165 | for (size_t j = 0; j < num_gos; j++) { |
    go[j].go.store(next_go);
167 | } |
168 | return next_go; |
169 | } |
170 | |
171 | void distributedBarrier::go_reset() { |
172 | for (size_t j = 0; j < max_threads; ++j) { |
173 | for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) { |
174 | flags[i][j].stillNeed = 1; |
175 | } |
    go[j].go.store(0);
177 | iter[j].iter = 0; |
178 | } |
179 | } |
180 | |
181 | // This function inits/re-inits the distributed barrier for a particular number |
182 | // of threads. If a resize of arrays is needed, it calls the resize function. |
183 | void distributedBarrier::init(size_t nthr) { |
184 | size_t old_max = max_threads; |
185 | if (nthr > max_threads) { // need more space in arrays |
186 | resize(nthr); |
187 | } |
188 | |
189 | for (size_t i = 0; i < max_threads; i++) { |
190 | for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) { |
191 | flags[j][i].stillNeed = 1; |
192 | } |
    go[i].go.store(0);
194 | iter[i].iter = 0; |
195 | if (i >= old_max) |
196 | sleep[i].sleep = false; |
197 | } |
198 | |
199 | // Recalculate num_gos, etc. based on new nthr |
  computeVarsForN(nthr);
201 | |
202 | num_threads = nthr; |
203 | |
204 | if (team_icvs == NULL) |
205 | team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t)); |
206 | } |
207 | |
208 | // This function is used only when KMP_BLOCKTIME is not infinite. |
209 | // static |
210 | void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team, |
211 | size_t start, size_t stop, size_t inc, |
212 | size_t tid) { |
213 | KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME); |
214 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
215 | return; |
216 | |
217 | kmp_info_t **other_threads = team->t.t_threads; |
218 | for (size_t thr = start; thr < stop; thr += inc) { |
219 | KMP_DEBUG_ASSERT(other_threads[thr]); |
220 | int gtid = other_threads[thr]->th.th_info.ds.ds_gtid; |
221 | // Wake up worker regardless of if it appears to be sleeping or not |
    __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223 | } |
224 | } |
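// As used by the release path below: the primary thread wakes the group
// leaders with
//   __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
//                             b->threads_per_group, tid);
// and each group leader (the primary thread included) wakes its own group with
//   __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);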
225 | |
226 | static void __kmp_dist_barrier_gather( |
227 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
228 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
229 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather); |
230 | kmp_team_t *team; |
231 | distributedBarrier *b; |
232 | kmp_info_t **other_threads; |
233 | kmp_uint64 my_current_iter, my_next_iter; |
234 | kmp_uint32 nproc; |
235 | bool group_leader; |
236 | |
237 | team = this_thr->th.th_team; |
238 | nproc = this_thr->th.th_team_nproc; |
239 | other_threads = team->t.t_threads; |
240 | b = team->t.b; |
241 | my_current_iter = b->iter[tid].iter; |
242 | my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS; |
243 | group_leader = ((tid % b->threads_per_group) == 0); |
244 | |
245 | KA_TRACE(20, |
246 | ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n" , |
247 | gtid, team->t.t_id, tid, bt)); |
248 | |
249 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
250 | // Barrier imbalance - save arrive time to the thread |
251 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
252 | this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = |
253 | __itt_get_timestamp(); |
254 | } |
255 | #endif |
256 | |
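  // The gather is two-level: plain workers just clear their stillNeed flag for
  // the current iteration; each group leader first waits for every thread in
  // its group to clear its flag (executing tasks while spinning), then clears
  // its own flag and waits for the other group leaders to do the same. With a
  // reduction, each group leader folds in its group and the primary thread
  // (tid 0) finally folds in the other group leaders.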
257 | if (group_leader) { |
258 | // Start from the thread after the group leader |
259 | size_t group_start = tid + 1; |
260 | size_t group_end = tid + b->threads_per_group; |
261 | size_t threads_pending = 0; |
262 | |
263 | if (group_end > nproc) |
264 | group_end = nproc; |
265 | do { // wait for threads in my group |
266 | threads_pending = 0; |
      // Check all the flags every time to avoid branch mispredict
268 | for (size_t thr = group_start; thr < group_end; thr++) { |
269 | // Each thread uses a different cache line |
270 | threads_pending += b->flags[my_current_iter][thr].stillNeed; |
271 | } |
272 | // Execute tasks here |
273 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
274 | kmp_task_team_t *task_team = this_thr->th.th_task_team; |
275 | if (task_team != NULL) { |
276 | if (TCR_SYNC_4(task_team->tt.tt_active)) { |
277 | if (KMP_TASKING_ENABLED(task_team)) { |
278 | int tasks_completed = FALSE; |
279 | __kmp_atomic_execute_tasks_64( |
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282 | } else |
283 | this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; |
284 | } |
285 | } else { |
286 | this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; |
287 | } // if |
288 | } |
289 | if (TCR_4(__kmp_global.g.g_done)) { |
290 | if (__kmp_global.g.g_abort) |
291 | __kmp_abort_thread(); |
292 | break; |
293 | } else if (__kmp_tasking_mode != tskm_immediate_exec && |
294 | this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { |
295 | this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; |
296 | } |
297 | } while (threads_pending > 0); |
298 | |
299 | if (reduce) { // Perform reduction if needed |
300 | OMPT_REDUCTION_DECL(this_thr, gtid); |
301 | OMPT_REDUCTION_BEGIN; |
302 | // Group leader reduces all threads in group |
303 | for (size_t thr = group_start; thr < group_end; thr++) { |
304 | (*reduce)(this_thr->th.th_local.reduce_data, |
305 | other_threads[thr]->th.th_local.reduce_data); |
306 | } |
307 | OMPT_REDUCTION_END; |
308 | } |
309 | |
310 | // Set flag for next iteration |
311 | b->flags[my_next_iter][tid].stillNeed = 1; |
312 | // Each thread uses a different cache line; resets stillNeed to 0 to |
313 | // indicate it has reached the barrier |
314 | b->flags[my_current_iter][tid].stillNeed = 0; |
315 | |
316 | do { // wait for all group leaders |
317 | threads_pending = 0; |
318 | for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) { |
319 | threads_pending += b->flags[my_current_iter][thr].stillNeed; |
320 | } |
321 | // Execute tasks here |
322 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
323 | kmp_task_team_t *task_team = this_thr->th.th_task_team; |
324 | if (task_team != NULL) { |
325 | if (TCR_SYNC_4(task_team->tt.tt_active)) { |
326 | if (KMP_TASKING_ENABLED(task_team)) { |
327 | int tasks_completed = FALSE; |
328 | __kmp_atomic_execute_tasks_64( |
                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331 | } else |
332 | this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; |
333 | } |
334 | } else { |
335 | this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; |
336 | } // if |
337 | } |
338 | if (TCR_4(__kmp_global.g.g_done)) { |
339 | if (__kmp_global.g.g_abort) |
340 | __kmp_abort_thread(); |
341 | break; |
342 | } else if (__kmp_tasking_mode != tskm_immediate_exec && |
343 | this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { |
344 | this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; |
345 | } |
346 | } while (threads_pending > 0); |
347 | |
348 | if (reduce) { // Perform reduction if needed |
349 | if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders |
350 | OMPT_REDUCTION_DECL(this_thr, gtid); |
351 | OMPT_REDUCTION_BEGIN; |
352 | for (size_t thr = b->threads_per_group; thr < nproc; |
353 | thr += b->threads_per_group) { |
354 | (*reduce)(this_thr->th.th_local.reduce_data, |
355 | other_threads[thr]->th.th_local.reduce_data); |
356 | } |
357 | OMPT_REDUCTION_END; |
358 | } |
359 | } |
360 | } else { |
361 | // Set flag for next iteration |
362 | b->flags[my_next_iter][tid].stillNeed = 1; |
363 | // Each thread uses a different cache line; resets stillNeed to 0 to |
364 | // indicate it has reached the barrier |
365 | b->flags[my_current_iter][tid].stillNeed = 0; |
366 | } |
367 | |
368 | KMP_MFENCE(); |
369 | |
370 | KA_TRACE(20, |
371 | ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n" , |
372 | gtid, team->t.t_id, tid, bt)); |
373 | } |
374 | |
375 | static void __kmp_dist_barrier_release( |
376 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
377 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
378 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release); |
379 | kmp_team_t *team; |
380 | distributedBarrier *b; |
381 | kmp_bstate_t *thr_bar; |
382 | kmp_uint64 my_current_iter, next_go; |
383 | size_t my_go_index; |
384 | bool group_leader; |
385 | |
386 | KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n" , |
387 | gtid, tid, bt)); |
388 | |
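  // The release runs the gather in reverse: the primary thread stores the new
  // go value into the first go flag of every group (waking the group leaders
  // when blocktime is finite), then releases its own group; each group leader,
  // once released, stores the go value into the remaining go flags of its
  // group and wakes any of its workers that went to sleep.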
389 | thr_bar = &this_thr->th.th_bar[bt].bb; |
390 | |
391 | if (!KMP_MASTER_TID(tid)) { |
392 | // workers and non-master group leaders need to check their presence in team |
393 | do { |
394 | if (this_thr->th.th_used_in_team.load() != 1 && |
395 | this_thr->th.th_used_in_team.load() != 3) { |
396 | // Thread is not in use in a team. Wait on location in tid's thread |
397 | // struct. The 0 value tells anyone looking that this thread is spinning |
398 | // or sleeping until this location becomes 3 again; 3 is the transition |
399 | // state to get to 1 which is waiting on go and being in the team |
400 | kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3); |
401 | if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2, |
402 | 0) || |
403 | this_thr->th.th_used_in_team.load() == 0) { |
          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405 | } |
406 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
407 | if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { |
408 | // In fork barrier where we could not get the object reliably |
          itt_sync_obj =
              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
          // Cancel wait on previous parallel region...
          __kmp_itt_task_starting(itt_sync_obj);
413 | |
414 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
415 | return; |
416 | |
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
          if (itt_sync_obj != NULL)
            // Call prepare as early as possible for "new" barrier
            __kmp_itt_task_finished(itt_sync_obj);
421 | } else |
422 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
423 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
424 | return; |
425 | } |
426 | if (this_thr->th.th_used_in_team.load() != 1 && |
427 | this_thr->th.th_used_in_team.load() != 3) // spurious wake-up? |
428 | continue; |
429 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
430 | return; |
431 | |
432 | // At this point, the thread thinks it is in use in a team, or in |
433 | // transition to be used in a team, but it might have reached this barrier |
434 | // before it was marked unused by the team. Unused threads are awoken and |
435 | // shifted to wait on local thread struct elsewhere. It also might reach |
436 | // this point by being picked up for use by a different team. Either way, |
437 | // we need to update the tid. |
438 | tid = __kmp_tid_from_gtid(gtid); |
439 | team = this_thr->th.th_team; |
440 | KMP_DEBUG_ASSERT(tid >= 0); |
441 | KMP_DEBUG_ASSERT(team); |
442 | b = team->t.b; |
443 | my_current_iter = b->iter[tid].iter; |
444 | next_go = my_current_iter + distributedBarrier::MAX_ITERS; |
445 | my_go_index = tid / b->threads_per_go; |
446 | if (this_thr->th.th_used_in_team.load() == 3) { |
447 | (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, |
448 | 1); |
449 | } |
450 | // Check if go flag is set |
451 | if (b->go[my_go_index].go.load() != next_go) { |
452 | // Wait on go flag on team |
453 | kmp_atomic_flag_64<false, true> my_flag( |
454 | &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep)); |
        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
456 | KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter || |
457 | b->iter[tid].iter == 0); |
458 | KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false); |
459 | } |
460 | |
461 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
462 | return; |
463 | // At this point, the thread's go location was set. This means the primary |
464 | // thread is safely in the barrier, and so this thread's data is |
465 | // up-to-date, but we should check again that this thread is really in |
466 | // use in the team, as it could have been woken up for the purpose of |
467 | // changing team size, or reaping threads at shutdown. |
468 | if (this_thr->th.th_used_in_team.load() == 1) |
469 | break; |
470 | } while (1); |
471 | |
472 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
473 | return; |
474 | |
475 | group_leader = ((tid % b->threads_per_group) == 0); |
476 | if (group_leader) { |
477 | // Tell all the threads in my group they can go! |
478 | for (size_t go_idx = my_go_index + 1; |
479 | go_idx < my_go_index + b->gos_per_group; go_idx++) { |
        b->go[go_idx].go.store(next_go);
481 | } |
482 | // Fence added so that workers can see changes to go. sfence inadequate. |
483 | KMP_MFENCE(); |
484 | } |
485 | |
486 | #if KMP_BARRIER_ICV_PUSH |
487 | if (propagate_icvs) { // copy ICVs to final dest |
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                (kmp_internal_control_t *)team->t.b->team_icvs);
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
494 | } |
495 | #endif |
496 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) { |
497 | // This thread is now awake and participating in the barrier; |
498 | // wake up the other threads in the group |
499 | size_t nproc = this_thr->th.th_team_nproc; |
500 | size_t group_end = tid + b->threads_per_group; |
501 | if (nproc < group_end) |
502 | group_end = nproc; |
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
504 | } |
505 | } else { // Primary thread |
506 | team = this_thr->th.th_team; |
507 | b = team->t.b; |
508 | my_current_iter = b->iter[tid].iter; |
509 | next_go = my_current_iter + distributedBarrier::MAX_ITERS; |
510 | #if KMP_BARRIER_ICV_PUSH |
511 | if (propagate_icvs) { |
512 | // primary thread has ICVs in final destination; copy |
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
515 | } |
516 | #endif |
517 | // Tell all the group leaders they can go! |
518 | for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) { |
      b->go[go_idx].go.store(next_go);
520 | } |
521 | |
522 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
523 | // Wake-up the group leaders |
524 | size_t nproc = this_thr->th.th_team_nproc; |
      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
                                b->threads_per_group, tid);
527 | } |
528 | |
529 | // Tell all the threads in my group they can go! |
530 | for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) { |
      b->go[go_idx].go.store(next_go);
532 | } |
533 | |
534 | // Fence added so that workers can see changes to go. sfence inadequate. |
535 | KMP_MFENCE(); |
536 | |
537 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
538 | // Wake-up the other threads in my group |
539 | size_t nproc = this_thr->th.th_team_nproc; |
540 | size_t group_end = tid + b->threads_per_group; |
541 | if (nproc < group_end) |
542 | group_end = nproc; |
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
544 | } |
545 | } |
546 | // Update to next iteration |
547 | KMP_ASSERT(my_current_iter == b->iter[tid].iter); |
548 | b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS; |
549 | |
550 | KA_TRACE( |
551 | 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n" , |
552 | gtid, team->t.t_id, tid, bt)); |
553 | } |
554 | |
555 | // Linear Barrier |
556 | template <bool cancellable = false> |
557 | static bool __kmp_linear_barrier_gather_template( |
558 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
559 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
560 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather); |
561 | kmp_team_t *team = this_thr->th.th_team; |
562 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
563 | kmp_info_t **other_threads = team->t.t_threads; |
564 | |
565 | KA_TRACE( |
566 | 20, |
567 | ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n" , |
568 | gtid, team->t.t_id, tid, bt)); |
569 | KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); |
570 | |
571 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
572 | // Barrier imbalance - save arrive time to the thread |
573 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
574 | this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = |
575 | __itt_get_timestamp(); |
576 | } |
577 | #endif |
578 | // We now perform a linear reduction to signal that all of the threads have |
579 | // arrived. |
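  // Each worker bumps its own b_arrived flag (waking the primary thread if it
  // is sleeping on it), while the primary thread waits on every worker's
  // b_arrived in turn, optionally folding in reduction data as each worker
  // arrives, and finally publishes the new value in the team's b_arrived.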
580 | if (!KMP_MASTER_TID(tid)) { |
581 | KA_TRACE(20, |
582 | ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" |
583 | "arrived(%p): %llu => %llu\n" , |
584 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team), |
585 | team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived, |
586 | thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); |
587 | // Mark arrival to primary thread |
588 | /* After performing this write, a worker thread may not assume that the team |
589 | is valid any more - it could be deallocated by the primary thread at any |
590 | time. */ |
591 | kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]); |
592 | flag.release(); |
593 | } else { |
594 | kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; |
595 | int nproc = this_thr->th.th_team_nproc; |
596 | int i; |
597 | // Don't have to worry about sleep bit here or atomic since team setting |
598 | kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; |
599 | |
600 | // Collect all the worker team member threads. |
601 | for (i = 1; i < nproc; ++i) { |
602 | #if KMP_CACHE_MANAGE |
603 | // Prefetch next thread's arrived count |
604 | if (i + 1 < nproc) |
605 | KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived); |
606 | #endif /* KMP_CACHE_MANAGE */ |
607 | KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " |
608 | "arrived(%p) == %llu\n" , |
609 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), |
610 | team->t.t_id, i, |
611 | &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); |
612 | |
613 | // Wait for worker thread to arrive |
614 | if (cancellable) { |
615 | kmp_flag_64<true, false> flag( |
616 | &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); |
617 | if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj))) |
618 | return true; |
619 | } else { |
620 | kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, |
621 | new_state); |
622 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
623 | } |
624 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
625 | // Barrier imbalance - write min of the thread time and the other thread |
626 | // time to the thread. |
627 | if (__kmp_forkjoin_frames_mode == 2) { |
628 | this_thr->th.th_bar_min_time = KMP_MIN( |
629 | this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); |
630 | } |
631 | #endif |
632 | if (reduce) { |
633 | KA_TRACE(100, |
634 | ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n" , |
635 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), |
636 | team->t.t_id, i)); |
637 | OMPT_REDUCTION_DECL(this_thr, gtid); |
638 | OMPT_REDUCTION_BEGIN; |
639 | (*reduce)(this_thr->th.th_local.reduce_data, |
640 | other_threads[i]->th.th_local.reduce_data); |
641 | OMPT_REDUCTION_END; |
642 | } |
643 | } |
644 | // Don't have to worry about sleep bit here or atomic since team setting |
645 | team_bar->b_arrived = new_state; |
646 | KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " |
647 | "arrived(%p) = %llu\n" , |
648 | gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, |
649 | new_state)); |
650 | } |
651 | KA_TRACE( |
652 | 20, |
653 | ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n" , |
654 | gtid, team->t.t_id, tid, bt)); |
655 | return false; |
656 | } |
657 | |
658 | template <bool cancellable = false> |
659 | static bool __kmp_linear_barrier_release_template( |
660 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
661 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
662 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); |
663 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
664 | kmp_team_t *team; |
665 | |
666 | if (KMP_MASTER_TID(tid)) { |
667 | unsigned int i; |
668 | kmp_uint32 nproc = this_thr->th.th_team_nproc; |
669 | kmp_info_t **other_threads; |
670 | |
671 | team = __kmp_threads[gtid]->th.th_team; |
672 | KMP_DEBUG_ASSERT(team != NULL); |
673 | other_threads = team->t.t_threads; |
674 | |
675 | KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for " |
676 | "barrier type %d\n" , |
677 | gtid, team->t.t_id, tid, bt)); |
678 | |
679 | if (nproc > 1) { |
680 | #if KMP_BARRIER_ICV_PUSH |
681 | { |
682 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); |
683 | if (propagate_icvs) { |
684 | ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); |
685 | for (i = 1; i < nproc; ++i) { |
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
688 | ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, |
689 | &team->t.t_implicit_task_taskdata[0].td_icvs); |
690 | } |
691 | ngo_sync(); |
692 | } |
693 | } |
694 | #endif // KMP_BARRIER_ICV_PUSH |
695 | |
696 | // Now, release all of the worker threads |
697 | for (i = 1; i < nproc; ++i) { |
698 | #if KMP_CACHE_MANAGE |
699 | // Prefetch next thread's go flag |
700 | if (i + 1 < nproc) |
701 | KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go); |
702 | #endif /* KMP_CACHE_MANAGE */ |
703 | KA_TRACE( |
704 | 20, |
705 | ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " |
706 | "go(%p): %u => %u\n" , |
707 | gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid, |
708 | team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go, |
709 | other_threads[i]->th.th_bar[bt].bb.b_go, |
710 | other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); |
711 | kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go, |
712 | other_threads[i]); |
713 | flag.release(); |
714 | } |
715 | } |
716 | } else { // Wait for the PRIMARY thread to release us |
717 | KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n" , |
718 | gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); |
719 | if (cancellable) { |
720 | kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
721 | if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj))) |
722 | return true; |
723 | } else { |
724 | kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
725 | flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); |
726 | } |
727 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
728 | if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { |
729 | // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is |
730 | // disabled) |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
734 | |
735 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
736 | return false; |
737 | |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
742 | } else |
743 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
744 | // Early exit for reaping threads releasing forkjoin barrier |
745 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
746 | return false; |
747 | // The worker thread may now assume that the team is valid. |
748 | #ifdef KMP_DEBUG |
749 | tid = __kmp_tid_from_gtid(gtid); |
750 | team = __kmp_threads[gtid]->th.th_team; |
751 | #endif |
752 | KMP_DEBUG_ASSERT(team != NULL); |
753 | TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); |
754 | KA_TRACE(20, |
755 | ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n" , |
756 | gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); |
757 | KMP_MB(); // Flush all pending memory write invalidates. |
758 | } |
759 | KA_TRACE( |
760 | 20, |
761 | ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n" , |
762 | gtid, team->t.t_id, tid, bt)); |
763 | return false; |
764 | } |
765 | |
766 | static void __kmp_linear_barrier_gather( |
767 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
768 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
769 | __kmp_linear_barrier_gather_template<false>( |
770 | bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
771 | } |
772 | |
773 | static bool __kmp_linear_barrier_gather_cancellable( |
774 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
775 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
776 | return __kmp_linear_barrier_gather_template<true>( |
777 | bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
778 | } |
779 | |
780 | static void __kmp_linear_barrier_release( |
781 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
782 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
783 | __kmp_linear_barrier_release_template<false>( |
784 | bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); |
785 | } |
786 | |
787 | static bool __kmp_linear_barrier_release_cancellable( |
788 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
789 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
790 | return __kmp_linear_barrier_release_template<true>( |
791 | bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj)); |
792 | } |
793 | |
794 | // Tree barrier |
795 | static void __kmp_tree_barrier_gather( |
796 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
797 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
798 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); |
799 | kmp_team_t *team = this_thr->th.th_team; |
800 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
801 | kmp_info_t **other_threads = team->t.t_threads; |
802 | kmp_uint32 nproc = this_thr->th.th_team_nproc; |
803 | kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; |
804 | kmp_uint32 branch_factor = 1 << branch_bits; |
805 | kmp_uint32 child; |
806 | kmp_uint32 child_tid; |
807 | kmp_uint64 new_state = 0; |
808 | |
809 | KA_TRACE( |
810 | 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n" , |
811 | gtid, team->t.t_id, tid, bt)); |
812 | KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); |
813 | |
814 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
815 | // Barrier imbalance - save arrive time to the thread |
816 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
817 | this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = |
818 | __itt_get_timestamp(); |
819 | } |
820 | #endif |
821 | // Perform tree gather to wait until all threads have arrived; reduce any |
822 | // required data as we go |
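  // Indexing sketch: with branch_bits == 2 (branch_factor == 4), tid 0's
  // children are tids 1..4, tid 1's children are tids 5..8, and the parent of
  // a worker tid is (tid - 1) >> branch_bits.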
823 | child_tid = (tid << branch_bits) + 1; |
824 | if (child_tid < nproc) { |
825 | // Parent threads wait for all their children to arrive |
826 | new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; |
827 | child = 1; |
828 | do { |
829 | kmp_info_t *child_thr = other_threads[child_tid]; |
830 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
831 | #if KMP_CACHE_MANAGE |
832 | // Prefetch next thread's arrived count |
833 | if (child + 1 <= branch_factor && child_tid + 1 < nproc) |
834 | KMP_CACHE_PREFETCH( |
835 | &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived); |
836 | #endif /* KMP_CACHE_MANAGE */ |
837 | KA_TRACE(20, |
838 | ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " |
839 | "arrived(%p) == %llu\n" , |
840 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
841 | team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); |
842 | // Wait for child to arrive |
843 | kmp_flag_64<> flag(&child_bar->b_arrived, new_state); |
844 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
845 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
846 | // Barrier imbalance - write min of the thread time and a child time to |
847 | // the thread. |
848 | if (__kmp_forkjoin_frames_mode == 2) { |
849 | this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, |
850 | child_thr->th.th_bar_min_time); |
851 | } |
852 | #endif |
853 | if (reduce) { |
854 | KA_TRACE(100, |
855 | ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n" , |
856 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
857 | team->t.t_id, child_tid)); |
858 | OMPT_REDUCTION_DECL(this_thr, gtid); |
859 | OMPT_REDUCTION_BEGIN; |
860 | (*reduce)(this_thr->th.th_local.reduce_data, |
861 | child_thr->th.th_local.reduce_data); |
862 | OMPT_REDUCTION_END; |
863 | } |
864 | child++; |
865 | child_tid++; |
866 | } while (child <= branch_factor && child_tid < nproc); |
867 | } |
868 | |
869 | if (!KMP_MASTER_TID(tid)) { // Worker threads |
870 | kmp_int32 parent_tid = (tid - 1) >> branch_bits; |
871 | |
872 | KA_TRACE(20, |
873 | ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " |
874 | "arrived(%p): %llu => %llu\n" , |
875 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), |
876 | team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, |
877 | thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); |
878 | |
879 | // Mark arrival to parent thread |
880 | /* After performing this write, a worker thread may not assume that the team |
881 | is valid any more - it could be deallocated by the primary thread at any |
882 | time. */ |
883 | kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]); |
884 | flag.release(); |
885 | } else { |
886 | // Need to update the team arrived pointer if we are the primary thread |
887 | if (nproc > 1) // New value was already computed above |
888 | team->t.t_bar[bt].b_arrived = new_state; |
889 | else |
890 | team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; |
891 | KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " |
892 | "arrived(%p) = %llu\n" , |
893 | gtid, team->t.t_id, tid, team->t.t_id, |
894 | &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); |
895 | } |
896 | KA_TRACE(20, |
897 | ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n" , |
898 | gtid, team->t.t_id, tid, bt)); |
899 | } |
900 | |
901 | static void __kmp_tree_barrier_release( |
902 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
903 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
904 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); |
905 | kmp_team_t *team; |
906 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
907 | kmp_uint32 nproc; |
908 | kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; |
909 | kmp_uint32 branch_factor = 1 << branch_bits; |
910 | kmp_uint32 child; |
911 | kmp_uint32 child_tid; |
912 | |
913 | // Perform a tree release for all of the threads that have been gathered |
914 | if (!KMP_MASTER_TID( |
915 | tid)) { // Handle fork barrier workers who aren't part of a team yet |
916 | KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n" , gtid, |
917 | &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); |
918 | // Wait for parent thread to release us |
919 | kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
920 | flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); |
921 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
922 | if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { |
923 | // In fork barrier where we could not get the object reliably (or |
924 | // ITTNOTIFY is disabled) |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
928 | |
929 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
930 | return; |
931 | |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
936 | } else |
937 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
938 | // Early exit for reaping threads releasing forkjoin barrier |
939 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
940 | return; |
941 | |
942 | // The worker thread may now assume that the team is valid. |
943 | team = __kmp_threads[gtid]->th.th_team; |
944 | KMP_DEBUG_ASSERT(team != NULL); |
945 | tid = __kmp_tid_from_gtid(gtid); |
946 | |
947 | TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); |
948 | KA_TRACE(20, |
949 | ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n" , gtid, |
950 | team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); |
951 | KMP_MB(); // Flush all pending memory write invalidates. |
952 | } else { |
953 | team = __kmp_threads[gtid]->th.th_team; |
954 | KMP_DEBUG_ASSERT(team != NULL); |
955 | KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for " |
956 | "barrier type %d\n" , |
957 | gtid, team->t.t_id, tid, bt)); |
958 | } |
959 | nproc = this_thr->th.th_team_nproc; |
960 | child_tid = (tid << branch_bits) + 1; |
961 | |
962 | if (child_tid < nproc) { |
963 | kmp_info_t **other_threads = team->t.t_threads; |
964 | child = 1; |
965 | // Parent threads release all their children |
966 | do { |
967 | kmp_info_t *child_thr = other_threads[child_tid]; |
968 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
969 | #if KMP_CACHE_MANAGE |
970 | // Prefetch next thread's go count |
971 | if (child + 1 <= branch_factor && child_tid + 1 < nproc) |
972 | KMP_CACHE_PREFETCH( |
973 | &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go); |
974 | #endif /* KMP_CACHE_MANAGE */ |
975 | |
976 | #if KMP_BARRIER_ICV_PUSH |
977 | { |
978 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); |
979 | if (propagate_icvs) { |
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
985 | } |
986 | } |
987 | #endif // KMP_BARRIER_ICV_PUSH |
988 | KA_TRACE(20, |
989 | ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" |
990 | "go(%p): %u => %u\n" , |
991 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
992 | team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, |
993 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
994 | // Release child from barrier |
995 | kmp_flag_64<> flag(&child_bar->b_go, child_thr); |
996 | flag.release(); |
997 | child++; |
998 | child_tid++; |
999 | } while (child <= branch_factor && child_tid < nproc); |
1000 | } |
1001 | KA_TRACE( |
1002 | 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n" , |
1003 | gtid, team->t.t_id, tid, bt)); |
1004 | } |
1005 | |
1006 | // Hyper Barrier |
1007 | static void __kmp_hyper_barrier_gather( |
1008 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
1009 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
1010 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); |
1011 | kmp_team_t *team = this_thr->th.th_team; |
1012 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
1013 | kmp_info_t **other_threads = team->t.t_threads; |
1014 | kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; |
1015 | kmp_uint32 num_threads = this_thr->th.th_team_nproc; |
1016 | kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; |
1017 | kmp_uint32 branch_factor = 1 << branch_bits; |
1018 | kmp_uint32 offset; |
1019 | kmp_uint32 level; |
1020 | |
1021 | KA_TRACE( |
1022 | 20, |
1023 | ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n" , |
1024 | gtid, team->t.t_id, tid, bt)); |
1025 | KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); |
1026 | |
1027 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1028 | // Barrier imbalance - save arrive time to the thread |
1029 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
1030 | this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = |
1031 | __itt_get_timestamp(); |
1032 | } |
1033 | #endif |
1034 | /* Perform a hypercube-embedded tree gather to wait until all of the threads |
1035 | have arrived, and reduce any required data as we go. */ |
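  /* Pairing sketch: with branch_bits == 2 (branch_factor == 4), at level 0 any
     thread whose tid is not a multiple of 4 reports to parent
     tid & ~(branch_factor - 1), e.g. tid 6 reports to tid 4; the surviving
     multiples of 4 repeat the scheme at the next level with a stride of 16,
     and so on until only tid 0 remains. */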
1036 | kmp_flag_64<> p_flag(&thr_bar->b_arrived); |
1037 | for (level = 0, offset = 1; offset < num_threads; |
1038 | level += branch_bits, offset <<= branch_bits) { |
1039 | kmp_uint32 child; |
1040 | kmp_uint32 child_tid; |
1041 | |
1042 | if (((tid >> level) & (branch_factor - 1)) != 0) { |
1043 | kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); |
1044 | |
1045 | KMP_MB(); // Synchronize parent and child threads. |
1046 | KA_TRACE(20, |
1047 | ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " |
1048 | "arrived(%p): %llu => %llu\n" , |
1049 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), |
1050 | team->t.t_id, parent_tid, &thr_bar->b_arrived, |
1051 | thr_bar->b_arrived, |
1052 | thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); |
1053 | // Mark arrival to parent thread |
1054 | /* After performing this write (in the last iteration of the enclosing for |
1055 | loop), a worker thread may not assume that the team is valid any more |
1056 | - it could be deallocated by the primary thread at any time. */ |
1057 | p_flag.set_waiter(other_threads[parent_tid]); |
1058 | p_flag.release(); |
1059 | break; |
1060 | } |
1061 | |
1062 | // Parent threads wait for children to arrive |
1063 | if (new_state == KMP_BARRIER_UNUSED_STATE) |
1064 | new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; |
1065 | for (child = 1, child_tid = tid + (1 << level); |
1066 | child < branch_factor && child_tid < num_threads; |
1067 | child++, child_tid += (1 << level)) { |
1068 | kmp_info_t *child_thr = other_threads[child_tid]; |
1069 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1070 | #if KMP_CACHE_MANAGE |
1071 | kmp_uint32 next_child_tid = child_tid + (1 << level); |
1072 | // Prefetch next thread's arrived count |
1073 | if (child + 1 < branch_factor && next_child_tid < num_threads) |
1074 | KMP_CACHE_PREFETCH( |
1075 | &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); |
1076 | #endif /* KMP_CACHE_MANAGE */ |
1077 | KA_TRACE(20, |
1078 | ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " |
1079 | "arrived(%p) == %llu\n" , |
1080 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
1081 | team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); |
1082 | // Wait for child to arrive |
1083 | kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state); |
1084 | c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1085 | KMP_MB(); // Synchronize parent and child threads. |
1086 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1087 | // Barrier imbalance - write min of the thread time and a child time to |
1088 | // the thread. |
1089 | if (__kmp_forkjoin_frames_mode == 2) { |
1090 | this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, |
1091 | child_thr->th.th_bar_min_time); |
1092 | } |
1093 | #endif |
1094 | if (reduce) { |
1095 | KA_TRACE(100, |
1096 | ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n" , |
1097 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
1098 | team->t.t_id, child_tid)); |
1099 | OMPT_REDUCTION_DECL(this_thr, gtid); |
1100 | OMPT_REDUCTION_BEGIN; |
1101 | (*reduce)(this_thr->th.th_local.reduce_data, |
1102 | child_thr->th.th_local.reduce_data); |
1103 | OMPT_REDUCTION_END; |
1104 | } |
1105 | } |
1106 | } |
1107 | |
1108 | if (KMP_MASTER_TID(tid)) { |
1109 | // Need to update the team arrived pointer if we are the primary thread |
1110 | if (new_state == KMP_BARRIER_UNUSED_STATE) |
1111 | team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; |
1112 | else |
1113 | team->t.t_bar[bt].b_arrived = new_state; |
1114 | KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " |
1115 | "arrived(%p) = %llu\n" , |
1116 | gtid, team->t.t_id, tid, team->t.t_id, |
1117 | &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); |
1118 | } |
1119 | KA_TRACE( |
1120 | 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n" , |
1121 | gtid, team->t.t_id, tid, bt)); |
1122 | } |
1123 | |
1124 | // The reverse versions seem to beat the forward versions overall |
1125 | #define KMP_REVERSE_HYPER_BAR |
1126 | static void __kmp_hyper_barrier_release( |
1127 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
1128 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
1129 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); |
1130 | kmp_team_t *team; |
1131 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
1132 | kmp_info_t **other_threads; |
1133 | kmp_uint32 num_threads; |
1134 | kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; |
1135 | kmp_uint32 branch_factor = 1 << branch_bits; |
1136 | kmp_uint32 child; |
1137 | kmp_uint32 child_tid; |
1138 | kmp_uint32 offset; |
1139 | kmp_uint32 level; |
1140 | |
1141 | /* Perform a hypercube-embedded tree release for all of the threads that have |
1142 | been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads |
1143 | are released in the reverse order of the corresponding gather, otherwise |
1144 | threads are released in the same order. */ |
1145 | if (KMP_MASTER_TID(tid)) { // primary thread |
1146 | team = __kmp_threads[gtid]->th.th_team; |
1147 | KMP_DEBUG_ASSERT(team != NULL); |
1148 | KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for " |
1149 | "barrier type %d\n" , |
1150 | gtid, team->t.t_id, tid, bt)); |
1151 | #if KMP_BARRIER_ICV_PUSH |
1152 | if (propagate_icvs) { // primary already has ICVs in final destination; copy |
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
1155 | } |
1156 | #endif |
1157 | } else { // Handle fork barrier workers who aren't part of a team yet |
1158 | KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n" , gtid, |
1159 | &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); |
1160 | // Wait for parent thread to release us |
1161 | kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
1162 | flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1163 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1164 | if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { |
1165 | // In fork barrier where we could not get the object reliably |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
1169 | |
1170 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
1171 | return; |
1172 | |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
1177 | } else |
1178 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
1179 | // Early exit for reaping threads releasing forkjoin barrier |
1180 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
1181 | return; |
1182 | |
1183 | // The worker thread may now assume that the team is valid. |
1184 | team = __kmp_threads[gtid]->th.th_team; |
1185 | KMP_DEBUG_ASSERT(team != NULL); |
1186 | tid = __kmp_tid_from_gtid(gtid); |
1187 | |
1188 | TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); |
1189 | KA_TRACE(20, |
1190 | ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n" , |
1191 | gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); |
1192 | KMP_MB(); // Flush all pending memory write invalidates. |
1193 | } |
1194 | num_threads = this_thr->th.th_team_nproc; |
1195 | other_threads = team->t.t_threads; |
1196 | |
1197 | #ifdef KMP_REVERSE_HYPER_BAR |
1198 | // Count up to correct level for parent |
1199 | for (level = 0, offset = 1; |
1200 | offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); |
1201 | level += branch_bits, offset <<= branch_bits) |
1202 | ; |
1203 | |
1204 | // Now go down from there |
1205 | for (level -= branch_bits, offset >>= branch_bits; offset != 0; |
1206 | level -= branch_bits, offset >>= branch_bits) |
1207 | #else |
1208 | // Go down the tree, level by level |
1209 | for (level = 0, offset = 1; offset < num_threads; |
1210 | level += branch_bits, offset <<= branch_bits) |
1211 | #endif // KMP_REVERSE_HYPER_BAR |
1212 | { |
1213 | #ifdef KMP_REVERSE_HYPER_BAR |
1214 | /* Now go in reverse order through the children, highest to lowest. |
1215 | Initial setting of child is conservative here. */ |
1216 | child = num_threads >> ((level == 0) ? level : level - 1); |
1217 | for (child = (child < branch_factor - 1) ? child : branch_factor - 1, |
1218 | child_tid = tid + (child << level); |
1219 | child >= 1; child--, child_tid -= (1 << level)) |
1220 | #else |
1221 | if (((tid >> level) & (branch_factor - 1)) != 0) |
1222 | // No need to go lower than this, since this is the level parent would be |
1223 | // notified |
1224 | break; |
1225 | // Iterate through children on this level of the tree |
1226 | for (child = 1, child_tid = tid + (1 << level); |
1227 | child < branch_factor && child_tid < num_threads; |
1228 | child++, child_tid += (1 << level)) |
1229 | #endif // KMP_REVERSE_HYPER_BAR |
1230 | { |
1231 | if (child_tid >= num_threads) |
1232 | continue; // Child doesn't exist so keep going |
1233 | else { |
1234 | kmp_info_t *child_thr = other_threads[child_tid]; |
1235 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1236 | #if KMP_CACHE_MANAGE |
1237 | kmp_uint32 next_child_tid = child_tid - (1 << level); |
1238 | // Prefetch next thread's go count |
1239 | #ifdef KMP_REVERSE_HYPER_BAR |
1240 | if (child - 1 >= 1 && next_child_tid < num_threads) |
1241 | #else |
1242 | if (child + 1 < branch_factor && next_child_tid < num_threads) |
1243 | #endif // KMP_REVERSE_HYPER_BAR |
1244 | KMP_CACHE_PREFETCH( |
1245 | &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); |
1246 | #endif /* KMP_CACHE_MANAGE */ |
1247 | |
1248 | #if KMP_BARRIER_ICV_PUSH |
1249 | if (propagate_icvs) // push my fixed ICVs to my child |
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1251 | #endif // KMP_BARRIER_ICV_PUSH |
1252 | |
1253 | KA_TRACE( |
1254 | 20, |
1255 | ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" |
1256 | "go(%p): %u => %u\n" , |
1257 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
1258 | team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, |
1259 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
1260 | // Release child from barrier |
1261 | kmp_flag_64<> flag(&child_bar->b_go, child_thr); |
1262 | flag.release(); |
1263 | } |
1264 | } |
1265 | } |
1266 | #if KMP_BARRIER_ICV_PUSH |
1267 | if (propagate_icvs && |
1268 | !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest |
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
1273 | } |
1274 | #endif |
1275 | KA_TRACE( |
1276 | 20, |
1277 | ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n" , |
1278 | gtid, team->t.t_id, tid, bt)); |
1279 | } |
1280 | |
1281 | // Hierarchical Barrier |
1282 | |
1283 | // Initialize thread barrier data |
1284 | /* Initializes/re-initializes the hierarchical barrier data stored on a thread. |
1285 | Performs the minimum amount of initialization required based on how the team |
1286 | has changed. Returns true if leaf children will require both on-core and |
1287 | traditional wake-up mechanisms. For example, if the team size increases, |
1288 | threads already in the team will respond to on-core wakeup on their parent |
1289 | thread, but threads newly added to the team will only be listening on the |
1290 | their local b_go. */ |
1291 | static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, |
1292 | kmp_bstate_t *thr_bar, |
1293 | kmp_uint32 nproc, int gtid, |
1294 | int tid, kmp_team_t *team) { |
1295 | // Checks to determine if (re-)initialization is needed |
1296 | bool uninitialized = thr_bar->team == NULL; |
1297 | bool team_changed = team != thr_bar->team; |
1298 | bool team_sz_changed = nproc != thr_bar->nproc; |
1299 | bool tid_changed = tid != thr_bar->old_tid; |
1300 | bool retval = false; |
1301 | |
1302 | if (uninitialized || team_sz_changed) { |
1303 | __kmp_get_hierarchy(nproc, thr_bar); |
1304 | } |
1305 | |
1306 | if (uninitialized || team_sz_changed || tid_changed) { |
1307 | thr_bar->my_level = thr_bar->depth - 1; // default for primary thread |
1308 | thr_bar->parent_tid = -1; // default for primary thread |
1309 | if (!KMP_MASTER_TID(tid)) { |
1310 | // if not primary thread, find parent thread in hierarchy |
1311 | kmp_uint32 d = 0; |
1312 | while (d < thr_bar->depth) { // find parent based on level of thread in |
1313 | // hierarchy, and note level |
1314 | kmp_uint32 rem; |
1315 | if (d == thr_bar->depth - 2) { // reached level right below the primary |
1316 | thr_bar->parent_tid = 0; |
1317 | thr_bar->my_level = d; |
1318 | break; |
1319 | } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) { |
1320 | // TODO: can we make the above op faster? |
1321 | // thread is not a subtree root at next level, so this is max |
1322 | thr_bar->parent_tid = tid - rem; |
1323 | thr_bar->my_level = d; |
1324 | break; |
1325 | } |
1326 | ++d; |
1327 | } |
1328 | } |
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
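    // offset is this thread's byte slot within its parent's 8-byte barrier
    // flags; the oncore code below addresses it as byte (offset + 1) of
    // b_go / b_arrived. The first sibling after the parent gets the highest
    // slot, and successive siblings get decreasing slots.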
1332 | thr_bar->old_tid = tid; |
1333 | thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; |
1334 | thr_bar->team = team; |
1335 | thr_bar->parent_bar = |
1336 | &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; |
1337 | } |
1338 | if (uninitialized || team_changed || tid_changed) { |
1339 | thr_bar->team = team; |
1340 | thr_bar->parent_bar = |
1341 | &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; |
1342 | retval = true; |
1343 | } |
1344 | if (uninitialized || team_sz_changed || tid_changed) { |
1345 | thr_bar->nproc = nproc; |
1346 | thr_bar->leaf_kids = thr_bar->base_leaf_kids; |
1347 | if (thr_bar->my_level == 0) |
1348 | thr_bar->leaf_kids = 0; |
1349 | if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) |
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1351 | thr_bar->leaf_state = 0; |
1352 | for (int i = 0; i < thr_bar->leaf_kids; ++i) |
1353 | ((char *)&(thr_bar->leaf_state))[7 - i] = 1; |
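    // Example: with leaf_kids == 3, bytes 7, 6 and 5 of leaf_state are set.
    // Each leaf kid later marks its own byte (index offset + 1) of this
    // thread's b_arrived via the oncore flag, so one 64-bit compare in the
    // gather code detects that every leaf kid has checked in.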
1354 | } |
1355 | return retval; |
1356 | } |
1357 | |
1358 | static void __kmp_hierarchical_barrier_gather( |
1359 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
1360 | void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
1361 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); |
1362 | kmp_team_t *team = this_thr->th.th_team; |
1363 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
1364 | kmp_uint32 nproc = this_thr->th.th_team_nproc; |
1365 | kmp_info_t **other_threads = team->t.t_threads; |
1366 | kmp_uint64 new_state = 0; |
1367 | |
1368 | int level = team->t.t_level; |
1369 | if (other_threads[0] |
1370 | ->th.th_teams_microtask) // are we inside the teams construct? |
1371 | if (this_thr->th.th_teams_size.nteams > 1) |
1372 | ++level; // level was not increased in teams construct for team_of_masters |
1373 | if (level == 1) |
1374 | thr_bar->use_oncore_barrier = 1; |
1375 | else |
1376 | thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested |
1377 | |
1378 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " |
1379 | "barrier type %d\n" , |
1380 | gtid, team->t.t_id, tid, bt)); |
1381 | KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); |
1382 | |
1383 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1384 | // Barrier imbalance - save arrive time to the thread |
1385 | if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { |
1386 | this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); |
1387 | } |
1388 | #endif |
1389 | |
1390 | (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, |
1391 | team); |
1392 | |
1393 | if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) |
1394 | kmp_int32 child_tid; |
1395 | new_state = |
1396 | (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; |
1397 | if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && |
1398 | thr_bar->use_oncore_barrier) { |
1399 | if (thr_bar->leaf_kids) { |
1400 | // First, wait for leaf children to check-in on my b_arrived flag |
1401 | kmp_uint64 leaf_state = |
1402 | KMP_MASTER_TID(tid) |
1403 | ? thr_bar->b_arrived | thr_bar->leaf_state |
1404 | : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; |
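        // leaf_state here is the wait target: the current arrived value with
        // this thread's leaf-kid bytes set. Each leaf kid releases its slot
        // in this b_arrived word via the oncore flag in the worker path
        // below, so a single 64-bit compare covers all leaf kids.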
1405 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " |
1406 | "for leaf kids\n" , |
1407 | gtid, team->t.t_id, tid)); |
1408 | kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state); |
1409 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1410 | if (reduce) { |
1411 | OMPT_REDUCTION_DECL(this_thr, gtid); |
1412 | OMPT_REDUCTION_BEGIN; |
1413 | for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; |
1414 | ++child_tid) { |
1415 | KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " |
1416 | "T#%d(%d:%d)\n" , |
1417 | gtid, team->t.t_id, tid, |
1418 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1419 | child_tid)); |
1420 | (*reduce)(this_thr->th.th_local.reduce_data, |
1421 | other_threads[child_tid]->th.th_local.reduce_data); |
1422 | } |
1423 | OMPT_REDUCTION_END; |
1424 | } |
1425 | // clear leaf_state bits |
1426 | KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state)); |
1427 | } |
1428 | // Next, wait for higher level children on each child's b_arrived flag |
1429 | for (kmp_uint32 d = 1; d < thr_bar->my_level; |
1430 | ++d) { // gather lowest level threads first, but skip 0 |
1431 | kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], |
1432 | skip = thr_bar->skip_per_level[d]; |
1433 | if (last > nproc) |
1434 | last = nproc; |
1435 | for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { |
1436 | kmp_info_t *child_thr = other_threads[child_tid]; |
1437 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1438 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " |
1439 | "T#%d(%d:%d) " |
1440 | "arrived(%p) == %llu\n" , |
1441 | gtid, team->t.t_id, tid, |
1442 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1443 | child_tid, &child_bar->b_arrived, new_state)); |
1444 | kmp_flag_64<> flag(&child_bar->b_arrived, new_state); |
1445 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1446 | if (reduce) { |
1447 | KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " |
1448 | "T#%d(%d:%d)\n" , |
1449 | gtid, team->t.t_id, tid, |
1450 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1451 | child_tid)); |
1452 | (*reduce)(this_thr->th.th_local.reduce_data, |
1453 | child_thr->th.th_local.reduce_data); |
1454 | } |
1455 | } |
1456 | } |
1457 | } else { // Blocktime is not infinite |
1458 | for (kmp_uint32 d = 0; d < thr_bar->my_level; |
1459 | ++d) { // Gather lowest level threads first |
1460 | kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], |
1461 | skip = thr_bar->skip_per_level[d]; |
1462 | if (last > nproc) |
1463 | last = nproc; |
1464 | for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { |
1465 | kmp_info_t *child_thr = other_threads[child_tid]; |
1466 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1467 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " |
1468 | "T#%d(%d:%d) " |
1469 | "arrived(%p) == %llu\n" , |
1470 | gtid, team->t.t_id, tid, |
1471 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1472 | child_tid, &child_bar->b_arrived, new_state)); |
1473 | kmp_flag_64<> flag(&child_bar->b_arrived, new_state); |
1474 | flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1475 | if (reduce) { |
1476 | KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " |
1477 | "T#%d(%d:%d)\n" , |
1478 | gtid, team->t.t_id, tid, |
1479 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1480 | child_tid)); |
1481 | (*reduce)(this_thr->th.th_local.reduce_data, |
1482 | child_thr->th.th_local.reduce_data); |
1483 | } |
1484 | } |
1485 | } |
1486 | } |
1487 | } |
1488 | // All subordinates are gathered; now release parent if not primary thread |
1489 | |
1490 | if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy |
1491 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing" |
1492 | " T#%d(%d:%d) arrived(%p): %llu => %llu\n" , |
1493 | gtid, team->t.t_id, tid, |
1494 | __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, |
1495 | thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, |
1496 | thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); |
1497 | /* Mark arrival to parent: After performing this write, a worker thread may |
1498 | not assume that the team is valid any more - it could be deallocated by |
1499 | the primary thread at any time. */ |
1500 | if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || |
1501 | !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived |
1502 | // flag; release it |
1503 | kmp_flag_64<> flag(&thr_bar->b_arrived, |
1504 | other_threads[thr_bar->parent_tid]); |
1505 | flag.release(); |
1506 | } else { |
1507 | // Leaf does special release on "offset" bits of parent's b_arrived flag |
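      // (The byte used here, at index offset + 1, is one of the bytes the
      // parent pre-set in its leaf_state mask during initialization, so the
      // parent's single 64-bit wait on b_arrived covers all of its leaf kids.)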
1508 | thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; |
1509 | kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, |
1510 | thr_bar->offset + 1); |
1511 | flag.set_waiter(other_threads[thr_bar->parent_tid]); |
1512 | flag.release(); |
1513 | } |
1514 | } else { // Primary thread needs to update the team's b_arrived value |
1515 | team->t.t_bar[bt].b_arrived = new_state; |
1516 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " |
1517 | "arrived(%p) = %llu\n" , |
1518 | gtid, team->t.t_id, tid, team->t.t_id, |
1519 | &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); |
1520 | } |
1521 | // Is the team access below unsafe or just technically invalid? |
1522 | KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " |
1523 | "barrier type %d\n" , |
1524 | gtid, team->t.t_id, tid, bt)); |
1525 | } |
1526 | |
1527 | static void __kmp_hierarchical_barrier_release( |
1528 | enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, |
1529 | int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { |
1530 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); |
1531 | kmp_team_t *team; |
1532 | kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; |
1533 | kmp_uint32 nproc; |
1534 | bool team_change = false; // indicates on-core barrier shouldn't be used |
1535 | |
1536 | if (KMP_MASTER_TID(tid)) { |
1537 | team = __kmp_threads[gtid]->th.th_team; |
1538 | KMP_DEBUG_ASSERT(team != NULL); |
1539 | KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary " |
1540 | "entered barrier type %d\n" , |
1541 | gtid, team->t.t_id, tid, bt)); |
1542 | } else { // Worker threads |
1543 | // Wait for parent thread to release me |
1544 | if (!thr_bar->use_oncore_barrier || |
1545 | __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || |
1546 | thr_bar->team == NULL) { |
1547 | // Use traditional method of waiting on my own b_go flag |
1548 | thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; |
1549 | kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); |
1550 | flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); |
1551 | TCW_8(thr_bar->b_go, |
1552 | KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time |
1553 | } else { // Thread barrier data is initialized, this is a leaf, blocktime is |
1554 | // infinite, not nested |
1555 | // Wait on my "offset" bits on parent's b_go flag |
1556 | thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; |
1557 | kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, |
1558 | thr_bar->offset + 1, bt, |
1559 | this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); |
1560 | flag.wait(this_thr, TRUE); |
1561 | if (thr_bar->wait_flag == |
1562 | KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go |
1563 | TCW_8(thr_bar->b_go, |
1564 | KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time |
1565 | } else { // Reset my bits on parent's b_go flag |
1566 | (RCAST(volatile char *, |
1567 | &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0; |
1568 | } |
1569 | } |
1570 | thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; |
1571 | // Early exit for reaping threads releasing forkjoin barrier |
1572 | if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) |
1573 | return; |
1574 | // The worker thread may now assume that the team is valid. |
1575 | team = __kmp_threads[gtid]->th.th_team; |
1576 | KMP_DEBUG_ASSERT(team != NULL); |
1577 | tid = __kmp_tid_from_gtid(gtid); |
1578 | |
1579 | KA_TRACE( |
1580 | 20, |
1581 | ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n" , |
1582 | gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); |
1583 | KMP_MB(); // Flush all pending memory write invalidates. |
1584 | } |
1585 | |
1586 | nproc = this_thr->th.th_team_nproc; |
1587 | int level = team->t.t_level; |
1588 | if (team->t.t_threads[0] |
1589 | ->th.th_teams_microtask) { // are we inside the teams construct? |
1590 | if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && |
1591 | this_thr->th.th_teams_level == level) |
1592 | ++level; // level was not increased in teams construct for team_of_workers |
1593 | if (this_thr->th.th_teams_size.nteams > 1) |
1594 | ++level; // level was not increased in teams construct for team_of_masters |
1595 | } |
1596 | if (level == 1) |
1597 | thr_bar->use_oncore_barrier = 1; |
1598 | else |
1599 | thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested |
1600 | |
1601 | // If the team size has increased, we still communicate with old leaves via |
1602 | // oncore barrier. |
1603 | unsigned short int old_leaf_kids = thr_bar->leaf_kids; |
1604 | kmp_uint64 old_leaf_state = thr_bar->leaf_state; |
1605 | team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, |
1606 | tid, team); |
1607 | // But if the entire team changes, we won't use oncore barrier at all |
1608 | if (team_change) |
1609 | old_leaf_kids = 0; |
1610 | |
1611 | #if KMP_BARRIER_ICV_PUSH |
1612 | if (propagate_icvs) { |
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
1615 | if (KMP_MASTER_TID( |
1616 | tid)) { // primary already has copy in final destination; copy |
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
1619 | } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && |
1620 | thr_bar->use_oncore_barrier) { // optimization for inf blocktime |
1621 | if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) |
1622 | // leaves (on-core children) pull parent's fixed ICVs directly to local |
1623 | // ICV store |
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
1626 | // non-leaves will get ICVs piggybacked with b_go via NGO store |
1627 | } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs |
1628 | if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can |
1629 | // access |
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1631 | else // leaves copy parent's fixed ICVs directly to local ICV store |
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
1634 | } |
1635 | } |
1636 | #endif // KMP_BARRIER_ICV_PUSH |
1637 | |
1638 | // Now, release my children |
1639 | if (thr_bar->my_level) { // not a leaf |
1640 | kmp_int32 child_tid; |
1641 | kmp_uint32 last; |
1642 | if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && |
1643 | thr_bar->use_oncore_barrier) { |
1644 | if (KMP_MASTER_TID(tid)) { // do a flat release |
1645 | // Set local b_go to bump children via NGO store of the cache line |
        // containing ICVs and b_go.
1647 | thr_bar->b_go = KMP_BARRIER_STATE_BUMP; |
1648 | // Use ngo stores if available; b_go piggybacks in the last 8 bytes of |
1649 | // the cache line |
1650 | ngo_load(&thr_bar->th_fixed_icvs); |
1651 | // This loops over all the threads skipping only the leaf nodes in the |
1652 | // hierarchy |
1653 | for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; |
1654 | child_tid += thr_bar->skip_per_level[1]) { |
1655 | kmp_bstate_t *child_bar = |
1656 | &team->t.t_threads[child_tid]->th.th_bar[bt].bb; |
1657 | KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " |
1658 | "releasing T#%d(%d:%d)" |
1659 | " go(%p): %u => %u\n" , |
1660 | gtid, team->t.t_id, tid, |
1661 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1662 | child_tid, &child_bar->b_go, child_bar->b_go, |
1663 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
1664 | // Use ngo store (if available) to both store ICVs and release child |
1665 | // via child's b_go |
1666 | ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); |
1667 | } |
1668 | ngo_sync(); |
1669 | } |
1670 | TCW_8(thr_bar->b_go, |
1671 | KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time |
1672 | // Now, release leaf children |
1673 | if (thr_bar->leaf_kids) { // if there are any |
1674 | // We test team_change on the off-chance that the level 1 team changed. |
1675 | if (team_change || |
1676 | old_leaf_kids < thr_bar->leaf_kids) { // some old, some new |
1677 | if (old_leaf_kids) { // release old leaf kids |
1678 | thr_bar->b_go |= old_leaf_state; |
1679 | } |
1680 | // Release new leaf kids |
1681 | last = tid + thr_bar->skip_per_level[1]; |
1682 | if (last > nproc) |
1683 | last = nproc; |
1684 | for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; |
1685 | ++child_tid) { // skip_per_level[0]=1 |
1686 | kmp_info_t *child_thr = team->t.t_threads[child_tid]; |
1687 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1688 | KA_TRACE( |
1689 | 20, |
1690 | ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" |
1691 | " T#%d(%d:%d) go(%p): %u => %u\n" , |
1692 | gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), |
1693 | team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, |
1694 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
1695 | // Release child using child's b_go flag |
1696 | kmp_flag_64<> flag(&child_bar->b_go, child_thr); |
1697 | flag.release(); |
1698 | } |
1699 | } else { // Release all children at once with leaf_state bits on my own |
1700 | // b_go flag |
1701 | thr_bar->b_go |= thr_bar->leaf_state; |
1702 | } |
1703 | } |
1704 | } else { // Blocktime is not infinite; do a simple hierarchical release |
1705 | for (int d = thr_bar->my_level - 1; d >= 0; |
1706 | --d) { // Release highest level threads first |
1707 | last = tid + thr_bar->skip_per_level[d + 1]; |
1708 | kmp_uint32 skip = thr_bar->skip_per_level[d]; |
1709 | if (last > nproc) |
1710 | last = nproc; |
1711 | for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { |
1712 | kmp_info_t *child_thr = team->t.t_threads[child_tid]; |
1713 | kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; |
1714 | KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " |
1715 | "releasing T#%d(%d:%d) go(%p): %u => %u\n" , |
1716 | gtid, team->t.t_id, tid, |
1717 | __kmp_gtid_from_tid(child_tid, team), team->t.t_id, |
1718 | child_tid, &child_bar->b_go, child_bar->b_go, |
1719 | child_bar->b_go + KMP_BARRIER_STATE_BUMP)); |
1720 | // Release child using child's b_go flag |
1721 | kmp_flag_64<> flag(&child_bar->b_go, child_thr); |
1722 | flag.release(); |
1723 | } |
1724 | } |
1725 | } |
1726 | #if KMP_BARRIER_ICV_PUSH |
1727 | if (propagate_icvs && !KMP_MASTER_TID(tid)) |
1728 | // non-leaves copy ICVs from fixed ICVs to local dest |
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
1731 | #endif // KMP_BARRIER_ICV_PUSH |
1732 | } |
1733 | KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " |
1734 | "barrier type %d\n" , |
1735 | gtid, team->t.t_id, tid, bt)); |
1736 | } |
1737 | |
1738 | // End of Barrier Algorithms |
1739 | |
1740 | // type traits for cancellable value |
1741 | // if cancellable is true, then is_cancellable is a normal boolean variable |
1742 | // if cancellable is false, then is_cancellable is a compile time constant |
1743 | template <bool cancellable> struct is_cancellable {}; |
1744 | template <> struct is_cancellable<true> { |
1745 | bool value; |
1746 | is_cancellable() : value(false) {} |
1747 | is_cancellable(bool b) : value(b) {} |
1748 | is_cancellable &operator=(bool b) { |
1749 | value = b; |
1750 | return *this; |
1751 | } |
1752 | operator bool() const { return value; } |
1753 | }; |
1754 | template <> struct is_cancellable<false> { |
1755 | is_cancellable &operator=(bool b) { return *this; } |
1756 | constexpr operator bool() const { return false; } |
1757 | }; |
1758 | |
1759 | // Internal function to do a barrier. |
/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
   If reduce is non-NULL, do a reduction barrier: the reduce callback combines
   the per-thread reduce_data during the gather phase.
   When cancellable = false,
   Returns 0 if primary thread, 1 if worker thread.
   When cancellable = true,
   Returns 0 if not cancelled, 1 if cancelled. */
1767 | template <bool cancellable = false> |
1768 | static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split, |
1769 | size_t reduce_size, void *reduce_data, |
1770 | void (*reduce)(void *, void *)) { |
1771 | KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); |
1772 | KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); |
1773 | int tid = __kmp_tid_from_gtid(gtid); |
1774 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
1775 | kmp_team_t *team = this_thr->th.th_team; |
1776 | int status = 0; |
1777 | is_cancellable<cancellable> cancelled; |
1778 | #if OMPT_SUPPORT && OMPT_OPTIONAL |
1779 | ompt_data_t *my_task_data; |
1780 | ompt_data_t *my_parallel_data; |
1781 | void *return_address; |
1782 | ompt_sync_region_t barrier_kind; |
1783 | #endif |
1784 | |
1785 | KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n" , gtid, |
1786 | __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); |
1787 | |
1788 | #if OMPT_SUPPORT |
1789 | if (ompt_enabled.enabled) { |
1790 | #if OMPT_OPTIONAL |
1791 | my_task_data = OMPT_CUR_TASK_DATA(this_thr); |
1792 | my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); |
1793 | return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); |
1794 | barrier_kind = __ompt_get_barrier_kind(bt, this_thr); |
1795 | if (ompt_enabled.ompt_callback_sync_region) { |
1796 | ompt_callbacks.ompt_callback(ompt_callback_sync_region)( |
1797 | barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, |
1798 | return_address); |
1799 | } |
1800 | if (ompt_enabled.ompt_callback_sync_region_wait) { |
1801 | ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( |
1802 | barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data, |
1803 | return_address); |
1804 | } |
1805 | #endif |
1806 | // It is OK to report the barrier state after the barrier begin callback. |
1807 | // According to the OMPT specification, a compliant implementation may |
1808 | // even delay reporting this state until the barrier begins to wait. |
1809 | auto *ompt_thr_info = &this_thr->th.ompt_thread_info; |
1810 | switch (barrier_kind) { |
1811 | case ompt_sync_region_barrier_explicit: |
1812 | ompt_thr_info->state = ompt_state_wait_barrier_explicit; |
1813 | break; |
1814 | case ompt_sync_region_barrier_implicit_workshare: |
1815 | ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare; |
1816 | break; |
1817 | case ompt_sync_region_barrier_implicit_parallel: |
1818 | ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel; |
1819 | break; |
1820 | case ompt_sync_region_barrier_teams: |
1821 | ompt_thr_info->state = ompt_state_wait_barrier_teams; |
1822 | break; |
1823 | case ompt_sync_region_barrier_implementation: |
1824 | [[fallthrough]]; |
1825 | default: |
1826 | ompt_thr_info->state = ompt_state_wait_barrier_implementation; |
1827 | } |
1828 | } |
1829 | #endif |
1830 | |
1831 | if (!team->t.t_serialized) { |
1832 | #if USE_ITT_BUILD |
1833 | // This value will be used in itt notify events below. |
1834 | void *itt_sync_obj = NULL; |
1835 | #if USE_ITT_NOTIFY |
1836 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1838 | #endif |
1839 | #endif /* USE_ITT_BUILD */ |
1840 | if (__kmp_tasking_mode == tskm_extra_barrier) { |
      __kmp_tasking_barrier(team, this_thr, gtid);
1842 | KA_TRACE(15, |
1843 | ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n" , gtid, |
1844 | __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); |
1845 | } |
1846 | |
1847 | /* Copy the blocktime info to the thread, where __kmp_wait_template() can |
1848 | access it when the team struct is not guaranteed to exist. */ |
1849 | // See note about the corresponding code in __kmp_join_barrier() being |
1850 | // performance-critical. |
1851 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
1852 | #if KMP_USE_MONITOR |
1853 | this_thr->th.th_team_bt_intervals = |
1854 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; |
1855 | this_thr->th.th_team_bt_set = |
1856 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; |
1857 | #else |
1858 | this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); |
1859 | #endif |
1860 | } |
1861 | |
1862 | #if USE_ITT_BUILD |
1863 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1865 | #endif /* USE_ITT_BUILD */ |
1866 | #if USE_DEBUGGER |
1867 | // Let the debugger know: the thread arrived to the barrier and waiting. |
1868 | if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct |
1869 | team->t.t_bar[bt].b_master_arrived += 1; |
1870 | } else { |
1871 | this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; |
1872 | } // if |
1873 | #endif /* USE_DEBUGGER */ |
1874 | if (reduce != NULL) { |
1875 | // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 |
1876 | this_thr->th.th_local.reduce_data = reduce_data; |
1877 | } |
1878 | |
1879 | if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) |
1880 | __kmp_task_team_setup(this_thr, team); |
1881 | |
1882 | if (cancellable) { |
1883 | cancelled = __kmp_linear_barrier_gather_cancellable( |
1884 | bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1885 | } else { |
1886 | switch (__kmp_barrier_gather_pattern[bt]) { |
1887 | case bp_dist_bar: { |
1888 | __kmp_dist_barrier_gather(bt, this_thr, gtid, tid, |
1889 | reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1890 | break; |
1891 | } |
1892 | case bp_hyper_bar: { |
1893 | // don't set branch bits to 0; use linear |
1894 | KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); |
1895 | __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, |
1896 | reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1897 | break; |
1898 | } |
1899 | case bp_hierarchical_bar: { |
1900 | __kmp_hierarchical_barrier_gather( |
1901 | bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1902 | break; |
1903 | } |
1904 | case bp_tree_bar: { |
1905 | // don't set branch bits to 0; use linear |
1906 | KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); |
1907 | __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, |
1908 | reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1909 | break; |
1910 | } |
1911 | default: { |
1912 | __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, |
1913 | reduce USE_ITT_BUILD_ARG(itt_sync_obj)); |
1914 | } |
1915 | } |
1916 | } |
1917 | |
1918 | KMP_MB(); |
1919 | |
1920 | if (KMP_MASTER_TID(tid)) { |
1921 | status = 0; |
1922 | if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { |
1923 | __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); |
1924 | } |
1925 | #if USE_DEBUGGER |
1926 | // Let the debugger know: All threads are arrived and starting leaving the |
1927 | // barrier. |
1928 | team->t.t_bar[bt].b_team_arrived += 1; |
1929 | #endif |
1930 | |
1931 | if (__kmp_omp_cancellation) { |
1932 | kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request); |
1933 | // Reset cancellation flag for worksharing constructs |
1934 | if (cancel_request == cancel_loop || |
1935 | cancel_request == cancel_sections) { |
1936 | KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq); |
1937 | } |
1938 | } |
1939 | #if USE_ITT_BUILD |
1940 | /* TODO: In case of split reduction barrier, primary thread may send |
1941 | acquired event early, before the final summation into the shared |
1942 | variable is done (final summation can be a long operation for array |
1943 | reductions). */ |
1944 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1946 | #endif /* USE_ITT_BUILD */ |
1947 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
1948 | // Barrier - report frame end (only if active_level == 1) |
1949 | if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && |
1950 | __kmp_forkjoin_frames_mode && |
1951 | (this_thr->th.th_teams_microtask == NULL || // either not in teams |
1952 | this_thr->th.th_teams_size.nteams == 1) && // or inside single team |
1953 | team->t.t_active_level == 1) { |
1954 | ident_t *loc = __kmp_threads[gtid]->th.th_ident; |
1955 | kmp_uint64 cur_time = __itt_get_timestamp(); |
1956 | kmp_info_t **other_threads = team->t.t_threads; |
1957 | int nproc = this_thr->th.th_team_nproc; |
1958 | int i; |
1959 | switch (__kmp_forkjoin_frames_mode) { |
1960 | case 1: |
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
1963 | this_thr->th.th_frame_time = cur_time; |
1964 | break; |
1965 | case 2: // AC 2015-01-19: currently does not work for hierarchical (to |
1966 | // be fixed) |
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
1969 | break; |
1970 | case 3: |
1971 | if (__itt_metadata_add_ptr) { |
1972 | // Initialize with primary thread's wait time |
1973 | kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; |
1974 | // Set arrive time to zero to be able to check it in |
1975 | // __kmp_invoke_task(); the same is done inside the loop below |
1976 | this_thr->th.th_bar_arrive_time = 0; |
1977 | for (i = 1; i < nproc; ++i) { |
1978 | delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); |
1979 | other_threads[i]->th.th_bar_arrive_time = 0; |
1980 | } |
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
1984 | } |
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
1987 | this_thr->th.th_frame_time = cur_time; |
1988 | break; |
1989 | } |
1990 | } |
1991 | #endif /* USE_ITT_BUILD */ |
1992 | } else { |
1993 | status = 1; |
1994 | #if USE_ITT_BUILD |
1995 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1997 | #endif /* USE_ITT_BUILD */ |
1998 | } |
1999 | if ((status == 1 || !is_split) && !cancelled) { |
2000 | if (cancellable) { |
2001 | cancelled = __kmp_linear_barrier_release_cancellable( |
2002 | bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2003 | } else { |
2004 | switch (__kmp_barrier_release_pattern[bt]) { |
2005 | case bp_dist_bar: { |
2006 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
2007 | __kmp_dist_barrier_release(bt, this_thr, gtid, tid, |
2008 | FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2009 | break; |
2010 | } |
2011 | case bp_hyper_bar: { |
2012 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
2013 | __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, |
2014 | FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2015 | break; |
2016 | } |
2017 | case bp_hierarchical_bar: { |
2018 | __kmp_hierarchical_barrier_release( |
2019 | bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2020 | break; |
2021 | } |
2022 | case bp_tree_bar: { |
2023 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
2024 | __kmp_tree_barrier_release(bt, this_thr, gtid, tid, |
2025 | FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2026 | break; |
2027 | } |
2028 | default: { |
2029 | __kmp_linear_barrier_release(bt, this_thr, gtid, tid, |
2030 | FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); |
2031 | } |
2032 | } |
2033 | } |
2034 | if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) { |
2035 | __kmp_task_team_sync(this_thr, team); |
2036 | } |
2037 | } |
2038 | |
2039 | #if USE_ITT_BUILD |
2040 | /* GEH: TODO: Move this under if-condition above and also include in |
2041 | __kmp_end_split_barrier(). This will more accurately represent the actual |
2042 | release time of the threads for split barriers. */ |
2043 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2045 | #endif /* USE_ITT_BUILD */ |
2046 | } else { // Team is serialized. |
2047 | status = 0; |
2048 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2049 | if (this_thr->th.th_task_team != NULL) { |
2050 | #if USE_ITT_NOTIFY |
2051 | void *itt_sync_obj = NULL; |
2052 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { |
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2055 | } |
2056 | #endif |
2057 | |
2058 | KMP_DEBUG_ASSERT( |
2059 | this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE || |
2060 | this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered == |
2061 | TRUE); |
2062 | __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); |
2063 | __kmp_task_team_setup(this_thr, team); |
2064 | |
2065 | #if USE_ITT_BUILD |
2066 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2068 | #endif /* USE_ITT_BUILD */ |
2069 | } |
2070 | } |
2071 | } |
2072 | KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n" , |
2073 | gtid, __kmp_team_from_gtid(gtid)->t.t_id, |
2074 | __kmp_tid_from_gtid(gtid), status)); |
2075 | |
2076 | #if OMPT_SUPPORT |
2077 | if (ompt_enabled.enabled) { |
2078 | #if OMPT_OPTIONAL |
2079 | if (ompt_enabled.ompt_callback_sync_region_wait) { |
2080 | ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( |
2081 | barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, |
2082 | return_address); |
2083 | } |
2084 | if (ompt_enabled.ompt_callback_sync_region) { |
2085 | ompt_callbacks.ompt_callback(ompt_callback_sync_region)( |
2086 | barrier_kind, ompt_scope_end, my_parallel_data, my_task_data, |
2087 | return_address); |
2088 | } |
2089 | #endif |
2090 | this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; |
2091 | } |
2092 | #endif |
2093 | |
2094 | if (cancellable) |
2095 | return (int)cancelled; |
2096 | return status; |
2097 | } |
2098 | |
2099 | // Returns 0 if primary thread, 1 if worker thread. |
2100 | int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, |
2101 | size_t reduce_size, void *reduce_data, |
2102 | void (*reduce)(void *, void *)) { |
2103 | return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data, |
2104 | reduce); |
2105 | } |
2106 | |
2107 | #if defined(KMP_GOMP_COMPAT) |
2108 | // Returns 1 if cancelled, 0 otherwise |
2109 | int __kmp_barrier_gomp_cancel(int gtid) { |
2110 | if (__kmp_omp_cancellation) { |
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
2113 | if (cancelled) { |
2114 | int tid = __kmp_tid_from_gtid(gtid); |
2115 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
2116 | if (KMP_MASTER_TID(tid)) { |
2117 | // Primary thread does not need to revert anything |
2118 | } else { |
2119 | // Workers need to revert their private b_arrived flag |
2120 | this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -= |
2121 | KMP_BARRIER_STATE_BUMP; |
2122 | } |
2123 | } |
2124 | return cancelled; |
2125 | } |
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2127 | return FALSE; |
2128 | } |
2129 | #endif |
2130 | |
2131 | void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { |
2132 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); |
2133 | KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); |
2134 | KMP_DEBUG_ASSERT(bt < bs_last_barrier); |
2135 | int tid = __kmp_tid_from_gtid(gtid); |
2136 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
2137 | kmp_team_t *team = this_thr->th.th_team; |
2138 | |
2139 | if (!team->t.t_serialized) { |
2140 | if (KMP_MASTER_GTID(gtid)) { |
2141 | switch (__kmp_barrier_release_pattern[bt]) { |
2142 | case bp_dist_bar: { |
2143 | __kmp_dist_barrier_release(bt, this_thr, gtid, tid, |
2144 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2145 | break; |
2146 | } |
2147 | case bp_hyper_bar: { |
2148 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
2149 | __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, |
2150 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2151 | break; |
2152 | } |
2153 | case bp_hierarchical_bar: { |
2154 | __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, |
2155 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2156 | break; |
2157 | } |
2158 | case bp_tree_bar: { |
2159 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); |
2160 | __kmp_tree_barrier_release(bt, this_thr, gtid, tid, |
2161 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2162 | break; |
2163 | } |
2164 | default: { |
2165 | __kmp_linear_barrier_release(bt, this_thr, gtid, tid, |
2166 | FALSE USE_ITT_BUILD_ARG(NULL)); |
2167 | } |
2168 | } |
2169 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2170 | __kmp_task_team_sync(this_thr, team); |
2171 | } // if |
2172 | } |
2173 | } |
2174 | } |
2175 | |
2176 | void __kmp_join_barrier(int gtid) { |
2177 | KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); |
2178 | KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); |
2179 | |
2180 | KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); |
2181 | |
2182 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
2183 | kmp_team_t *team; |
2184 | int tid; |
2185 | #ifdef KMP_DEBUG |
2186 | int team_id; |
2187 | #endif /* KMP_DEBUG */ |
2188 | #if USE_ITT_BUILD |
2189 | void *itt_sync_obj = NULL; |
2190 | #if USE_ITT_NOTIFY |
2191 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need |
2192 | // Get object created at fork_barrier |
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2194 | #endif |
2195 | #endif /* USE_ITT_BUILD */ |
2196 | #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG) |
2197 | int nproc = this_thr->th.th_team_nproc; |
2198 | #endif |
2199 | KMP_MB(); |
2200 | |
2201 | // Get current info |
2202 | team = this_thr->th.th_team; |
2203 | KMP_DEBUG_ASSERT(nproc == team->t.t_nproc); |
2204 | tid = __kmp_tid_from_gtid(gtid); |
2205 | #ifdef KMP_DEBUG |
2206 | team_id = team->t.t_id; |
2207 | kmp_info_t *master_thread = this_thr->th.th_team_master; |
2208 | if (master_thread != team->t.t_threads[0]) { |
2209 | __kmp_print_structure(); |
2210 | } |
2211 | #endif /* KMP_DEBUG */ |
2212 | KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); |
2213 | KMP_MB(); |
2214 | |
2215 | // Verify state |
2216 | KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); |
2217 | KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); |
2218 | KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); |
2219 | KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n" , |
2220 | gtid, team_id, tid)); |
2221 | |
2222 | #if OMPT_SUPPORT |
2223 | if (ompt_enabled.enabled) { |
2224 | #if OMPT_OPTIONAL |
2225 | ompt_data_t *my_task_data; |
2226 | ompt_data_t *my_parallel_data; |
2227 | void *codeptr = NULL; |
2228 | int ds_tid = this_thr->th.th_info.ds.ds_tid; |
2229 | if (KMP_MASTER_TID(ds_tid) && |
2230 | (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || |
2231 | ompt_callbacks.ompt_callback(ompt_callback_sync_region))) |
2232 | codeptr = team->t.ompt_team_info.master_return_address; |
2233 | my_task_data = OMPT_CUR_TASK_DATA(this_thr); |
2234 | my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); |
2235 | ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; |
2236 | ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel; |
2237 | if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) { |
2238 | sync_kind = ompt_sync_region_barrier_teams; |
2239 | ompt_state = ompt_state_wait_barrier_teams; |
2240 | } |
2241 | if (ompt_enabled.ompt_callback_sync_region) { |
2242 | ompt_callbacks.ompt_callback(ompt_callback_sync_region)( |
2243 | sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr); |
2244 | } |
2245 | if (ompt_enabled.ompt_callback_sync_region_wait) { |
2246 | ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( |
2247 | sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr); |
2248 | } |
2249 | if (!KMP_MASTER_TID(ds_tid)) |
2250 | this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); |
2251 | #endif |
2252 | this_thr->th.ompt_thread_info.state = ompt_state; |
2253 | } |
2254 | #endif |
2255 | |
2256 | if (__kmp_tasking_mode == tskm_extra_barrier) { |
    __kmp_tasking_barrier(team, this_thr, gtid);
2258 | KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n" , |
2259 | gtid, team_id, tid)); |
2260 | } |
2261 | #ifdef KMP_DEBUG |
2262 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2263 | KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " |
2264 | "%p, th_task_team = %p\n" , |
2265 | __kmp_gtid_from_thread(this_thr), team_id, |
2266 | team->t.t_task_team[this_thr->th.th_task_state], |
2267 | this_thr->th.th_task_team)); |
2268 | KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr); |
2269 | } |
2270 | #endif /* KMP_DEBUG */ |
2271 | |
2272 | /* Copy the blocktime info to the thread, where __kmp_wait_template() can |
2273 | access it when the team struct is not guaranteed to exist. Doing these |
   loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
2275 | we do not perform the copy if blocktime=infinite, since the values are not |
2276 | used by __kmp_wait_template() in that case. */ |
2277 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
2278 | #if KMP_USE_MONITOR |
2279 | this_thr->th.th_team_bt_intervals = |
2280 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; |
2281 | this_thr->th.th_team_bt_set = |
2282 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; |
2283 | #else |
2284 | this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); |
2285 | #endif |
2286 | } |
2287 | |
2288 | #if USE_ITT_BUILD |
2289 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2291 | #endif /* USE_ITT_BUILD */ |
2292 | |
2293 | switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { |
2294 | case bp_dist_bar: { |
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2297 | break; |
2298 | } |
2299 | case bp_hyper_bar: { |
2300 | KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); |
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2303 | break; |
2304 | } |
2305 | case bp_hierarchical_bar: { |
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2308 | break; |
2309 | } |
2310 | case bp_tree_bar: { |
2311 | KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); |
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2314 | break; |
2315 | } |
2316 | default: { |
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2319 | } |
2320 | } |
2321 | |
2322 | /* From this point on, the team data structure may be deallocated at any time |
2323 | by the primary thread - it is unsafe to reference it in any of the worker |
2324 | threads. Any per-team data items that need to be referenced before the |
2325 | end of the barrier should be moved to the kmp_task_team_t structs. */ |
2326 | if (KMP_MASTER_TID(tid)) { |
2327 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2328 | __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); |
2329 | } |
2330 | if (__kmp_display_affinity) { |
2331 | KMP_CHECK_UPDATE(team->t.t_display_affinity, 0); |
2332 | } |
2333 | #if KMP_STATS_ENABLED |
    // Have primary thread flag the workers to indicate they are now waiting
    // for the next parallel region. Also wake them up so they switch their
    // timers to idle.
2337 | for (int i = 0; i < team->t.t_nproc; ++i) { |
2338 | kmp_info_t *team_thread = team->t.t_threads[i]; |
2339 | if (team_thread == this_thr) |
2340 | continue; |
2341 | team_thread->th.th_stats->setIdleFlag(); |
2342 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && |
2343 | team_thread->th.th_sleep_loc != NULL) |
2344 | __kmp_null_resume_wrapper(team_thread); |
2345 | } |
2346 | #endif |
2347 | #if USE_ITT_BUILD |
2348 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2350 | #endif /* USE_ITT_BUILD */ |
2351 | |
2352 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
2353 | // Join barrier - report frame end |
2354 | if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && |
2355 | __kmp_forkjoin_frames_mode && |
2356 | (this_thr->th.th_teams_microtask == NULL || // either not in teams |
2357 | this_thr->th.th_teams_size.nteams == 1) && // or inside single team |
2358 | team->t.t_active_level == 1) { |
2359 | kmp_uint64 cur_time = __itt_get_timestamp(); |
2360 | ident_t *loc = team->t.t_ident; |
2361 | kmp_info_t **other_threads = team->t.t_threads; |
2362 | switch (__kmp_forkjoin_frames_mode) { |
2363 | case 1: |
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
2366 | break; |
2367 | case 2: |
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
2370 | break; |
2371 | case 3: |
2372 | if (__itt_metadata_add_ptr) { |
2373 | // Initialize with primary thread's wait time |
2374 | kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; |
2375 | // Set arrive time to zero to be able to check it in |
2376 | // __kmp_invoke_task(); the same is done inside the loop below |
2377 | this_thr->th.th_bar_arrive_time = 0; |
2378 | for (int i = 1; i < nproc; ++i) { |
2379 | delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); |
2380 | other_threads[i]->th.th_bar_arrive_time = 0; |
2381 | } |
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
2384 | } |
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
2387 | this_thr->th.th_frame_time = cur_time; |
2388 | break; |
2389 | } |
2390 | } |
2391 | #endif /* USE_ITT_BUILD */ |
2392 | } |
2393 | #if USE_ITT_BUILD |
2394 | else { |
2395 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) |
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2397 | } |
2398 | #endif /* USE_ITT_BUILD */ |
2399 | |
2400 | #if KMP_DEBUG |
2401 | if (KMP_MASTER_TID(tid)) { |
2402 | KA_TRACE( |
2403 | 15, |
2404 | ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n" , |
2405 | gtid, team_id, tid, nproc)); |
2406 | } |
2407 | #endif /* KMP_DEBUG */ |
2408 | |
2409 | // TODO now, mark worker threads as done so they may be disbanded |
2410 | KMP_MB(); // Flush all pending memory write invalidates. |
2411 | KA_TRACE(10, |
2412 | ("__kmp_join_barrier: T#%d(%d:%d) leaving\n" , gtid, team_id, tid)); |
2413 | |
2414 | } |
2415 | |
2416 | // TODO release worker threads' fork barriers as we are ready instead of all at |
2417 | // once |
2418 | void __kmp_fork_barrier(int gtid, int tid) { |
2419 | KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); |
2420 | KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); |
2421 | kmp_info_t *this_thr = __kmp_threads[gtid]; |
2422 | kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; |
2423 | #if USE_ITT_BUILD |
2424 | void *itt_sync_obj = NULL; |
2425 | #endif /* USE_ITT_BUILD */ |
2426 | #ifdef KMP_DEBUG |
2427 | if (team) |
2428 | KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n" , gtid, |
2429 | (team != NULL) ? team->t.t_id : -1, tid)); |
2430 | #endif |
2431 | // th_team pointer only valid for primary thread here |
2432 | if (KMP_MASTER_TID(tid)) { |
2433 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
2434 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { |
2435 | // Create itt barrier object |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2438 | } |
2439 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
2440 | |
2441 | #ifdef KMP_DEBUG |
2442 | KMP_DEBUG_ASSERT(team); |
2443 | kmp_info_t **other_threads = team->t.t_threads; |
2444 | int i; |
2445 | |
2446 | // Verify state |
2447 | KMP_MB(); |
2448 | |
2449 | for (i = 1; i < team->t.t_nproc; ++i) { |
2450 | KA_TRACE(500, |
2451 | ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " |
2452 | "== %u.\n" , |
2453 | gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, |
2454 | team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, |
2455 | other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); |
2456 | KMP_DEBUG_ASSERT( |
2457 | (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & |
2458 | ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); |
2459 | KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); |
2460 | } |
2461 | #endif |
2462 | |
2463 | if (__kmp_tasking_mode != tskm_immediate_exec) |
2464 | __kmp_task_team_setup(this_thr, team); |
2465 | |
2466 | /* The primary thread may have changed its blocktime between join barrier |
2467 | and fork barrier. Copy the blocktime info to the thread, where |
2468 | __kmp_wait_template() can access it when the team struct is not |
2469 | guaranteed to exist. */ |
2470 | // See note about the corresponding code in __kmp_join_barrier() being |
2471 | // performance-critical |
2472 | if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { |
2473 | #if KMP_USE_MONITOR |
2474 | this_thr->th.th_team_bt_intervals = |
2475 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; |
2476 | this_thr->th.th_team_bt_set = |
2477 | team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; |
2478 | #else |
2479 | this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid); |
2480 | #endif |
2481 | } |
2482 | } // primary thread |
2483 | |
2484 | switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { |
2485 | case bp_dist_bar: { |
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
2488 | break; |
2489 | } |
2490 | case bp_hyper_bar: { |
2491 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); |
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2494 | break; |
2495 | } |
2496 | case bp_hierarchical_bar: { |
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2499 | break; |
2500 | } |
2501 | case bp_tree_bar: { |
2502 | KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); |
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2505 | break; |
2506 | } |
2507 | default: { |
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2510 | } |
2511 | } |
2512 | |
2513 | #if OMPT_SUPPORT |
2514 | ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state; |
2515 | if (ompt_enabled.enabled && |
2516 | (ompt_state == ompt_state_wait_barrier_teams || |
2517 | ompt_state == ompt_state_wait_barrier_implicit_parallel)) { |
2518 | int ds_tid = this_thr->th.th_info.ds.ds_tid; |
2519 | ompt_data_t *task_data = (team) |
2520 | ? OMPT_CUR_TASK_DATA(this_thr) |
2521 | : &(this_thr->th.ompt_thread_info.task_data); |
2522 | this_thr->th.ompt_thread_info.state = ompt_state_overhead; |
2523 | #if OMPT_OPTIONAL |
2524 | void *codeptr = NULL; |
2525 | if (KMP_MASTER_TID(ds_tid) && |
2526 | (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || |
2527 | ompt_callbacks.ompt_callback(ompt_callback_sync_region))) |
2528 | codeptr = team ? team->t.ompt_team_info.master_return_address : NULL; |
2529 | ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel; |
2530 | if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) |
2531 | sync_kind = ompt_sync_region_barrier_teams; |
2532 | if (ompt_enabled.ompt_callback_sync_region_wait) { |
2533 | ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( |
2534 | sync_kind, ompt_scope_end, NULL, task_data, codeptr); |
2535 | } |
2536 | if (ompt_enabled.ompt_callback_sync_region) { |
2537 | ompt_callbacks.ompt_callback(ompt_callback_sync_region)( |
2538 | sync_kind, ompt_scope_end, NULL, task_data, codeptr); |
2539 | } |
2540 | #endif |
2541 | if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { |
2542 | ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
2543 | ompt_scope_end, NULL, task_data, 0, ds_tid, |
2544 | ompt_task_implicit); // TODO: Can this be ompt_task_initial? |
2545 | } |
2546 | } |
2547 | #endif |
2548 | |
2549 | // Early exit for reaping threads releasing forkjoin barrier |
2550 | if (TCR_4(__kmp_global.g.g_done)) { |
2551 | this_thr->th.th_task_team = NULL; |
2552 | |
2553 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
2554 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { |
2555 | if (!KMP_MASTER_TID(tid)) { |
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2559 | } |
2560 | } |
2561 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
2562 | KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n" , gtid)); |
2563 | return; |
2564 | } |
2565 | |
2566 | /* We can now assume that a valid team structure has been allocated by the |
2567 | primary thread and propagated to all worker threads. The current thread, |
2568 | however, may not be part of the team, so we can't blindly assume that the |
2569 | team pointer is non-null. */ |
2570 | team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); |
2571 | KMP_DEBUG_ASSERT(team != NULL); |
2572 | tid = __kmp_tid_from_gtid(gtid); |
2573 | |
2574 | #if KMP_BARRIER_ICV_PULL |
2575 | /* Primary thread's copy of the ICVs was set up on the implicit taskdata in |
2576 | __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's |
2577 | implicit task has this data before this function is called. We cannot |
2578 | modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's |
2579 | thread struct, because it is not always the case that the threads arrays |
2580 | have been allocated when __kmp_fork_call() is executed. */ |
2581 | { |
2582 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); |
2583 | if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs |
2584 | // Copy the initial ICVs from the primary thread's thread struct to the |
2585 | // implicit task for this tid. |
2586 | KA_TRACE(10, |
2587 | ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n" , gtid, tid)); |
2588 | __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, |
2589 | tid, FALSE); |
2590 | copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, |
2591 | &team->t.t_threads[0] |
2592 | ->th.th_bar[bs_forkjoin_barrier] |
2593 | .bb.th_fixed_icvs); |
2594 | } |
2595 | } |
2596 | #endif // KMP_BARRIER_ICV_PULL |
2597 | |
2598 | if (__kmp_tasking_mode != tskm_immediate_exec) { |
2599 | __kmp_task_team_sync(this_thr, team); |
2600 | } |
2601 | |
2602 | #if KMP_AFFINITY_SUPPORTED |
2603 | kmp_proc_bind_t proc_bind = team->t.t_proc_bind; |
2604 | if (proc_bind == proc_bind_intel) { |
2605 | // Call dynamic affinity settings |
2606 | if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) { |
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2608 | } |
2609 | } else if (proc_bind != proc_bind_false) { |
2610 | if (this_thr->th.th_new_place == this_thr->th.th_current_place) { |
2611 | KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n" , |
2612 | __kmp_gtid_from_thread(this_thr), |
2613 | this_thr->th.th_current_place)); |
2614 | } else { |
2615 | __kmp_affinity_bind_place(gtid); |
2616 | } |
2617 | } |
2618 | #endif // KMP_AFFINITY_SUPPORTED |
2619 | // Perform the display affinity functionality |
2620 | if (__kmp_display_affinity) { |
2621 | if (team->t.t_display_affinity |
2622 | #if KMP_AFFINITY_SUPPORTED |
2623 | || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) |
2624 | #endif |
2625 | ) { |
2626 | // NULL means use the affinity-format-var ICV |
2627 | __kmp_aux_display_affinity(gtid, NULL); |
2628 | this_thr->th.th_prev_num_threads = team->t.t_nproc; |
2629 | this_thr->th.th_prev_level = team->t.t_level; |
2630 | } |
2631 | } |
2632 | if (!KMP_MASTER_TID(tid)) |
2633 | KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator); |
2634 | |
2635 | #if USE_ITT_BUILD && USE_ITT_NOTIFY |
2636 | if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { |
2637 | if (!KMP_MASTER_TID(tid)) { |
2638 | // Get correct barrier object |
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2641 | } // (prepare called inside barrier_release) |
2642 | } |
2643 | #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2645 | team->t.t_id, tid)); |
2646 | } |
2647 | |
2648 | void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, |
2649 | kmp_internal_control_t *new_icvs, ident_t *loc) { |
2650 | KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); |
2651 | |
2652 | KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); |
2653 | KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); |
2654 | |
2655 | /* Primary thread's copy of the ICVs was set up on the implicit taskdata in |
2656 | __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's |
2657 | implicit task has this data before this function is called. */ |
2658 | #if KMP_BARRIER_ICV_PULL |
2659 | /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which |
2660 | remains untouched), where all of the worker threads can access them and |
2661 | make their own copies after the barrier. */ |
2662 | KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be |
2663 | // allocated at this point |
2664 | copy_icvs( |
2665 | &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, |
2666 | new_icvs); |
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2668 | team->t.t_threads[0], team)); |
2669 | #elif KMP_BARRIER_ICV_PUSH |
2670 | // The ICVs will be propagated in the fork barrier, so nothing needs to be |
2671 | // done here. |
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2673 | team->t.t_threads[0], team)); |
2674 | #else |
2675 | // Copy the ICVs to each of the non-primary threads. This takes O(nthreads) |
2676 | // time. |
2677 | ngo_load(new_icvs); |
2678 | KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be |
2679 | // allocated at this point |
2680 | for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread |
2681 | // TODO: GEH - pass in better source location info since usually NULL here |
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2683 | f, team->t.t_threads[f], team)); |
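    // (Re)initialize the implicit task for thread f before copying the new
    // ICVs into its taskdata.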
2684 | __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); |
2685 | ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); |
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2687 | f, team->t.t_threads[f], team)); |
2688 | } |
2689 | ngo_sync(); |
2690 | #endif // KMP_BARRIER_ICV_PULL |
2691 | } |
2692 | |