//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Include all synchronization.
//
//===----------------------------------------------------------------------===//

#include "Synchronization.h"

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"

using namespace ompx;

namespace impl {

/// Atomics
///
///{
///}

/// AMDGCN Implementation
///
///{
#ifdef __AMDGPU__

uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope) {
  // __builtin_amdgcn_atomic_inc32 should expand to this switch when
  // passed a runtime value, but does not do so yet. Workaround here.

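  // Illustrative expansion (comment only, not compiled): ScopeSwitch(ORDER)
  // maps each MemScopeTy to the matching sync-scope string, e.g. for the
  // device scope it becomes
  //   return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent");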
#define ScopeSwitch(ORDER) \
  switch (MemScope) { \
  case atomic::MemScopeTy::system: \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, ""); \
  case atomic::MemScopeTy::device: \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent"); \
  case atomic::MemScopeTy::workgroup: \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup"); \
  case atomic::MemScopeTy::wavefront: \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront"); \
  case atomic::MemScopeTy::single: \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread"); \
  }

#define Case(ORDER) \
  case ORDER: \
    ScopeSwitch(ORDER)

  switch (Ordering) {
  default:
    __builtin_unreachable();
    Case(atomic::relaxed);
    Case(atomic::acquire);
    Case(atomic::release);
    Case(atomic::acq_rel);
    Case(atomic::seq_cst);
#undef Case
#undef ScopeSwitch
  }
}

[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;

void namedBarrierInit() {
  // Don't have global ctors, and shared memory is not zero init
  atomic::store(&namedBarrierTracker, 0u, atomic::release);
}

void namedBarrier() {
  uint32_t NumThreads = omp_get_num_threads();
  // assert(NumThreads % 32 == 0);

  uint32_t WarpSize = mapping::getWarpSize();
  uint32_t NumWaves = NumThreads / WarpSize;

  fence::team(atomic::acquire);

  // named barrier implementation for amdgcn.
  // Uses two 16 bit unsigned counters. One for the number of waves to have
  // reached the barrier, and one to count how many times the barrier has been
  // passed. These are packed in a single atomically accessed 32 bit integer.
  // Low bits for the number of waves, assumed zero before this call.
  // High bits to count the number of times the barrier has been passed.

  // precondition: NumWaves != 0;
  // invariant: NumWaves * WarpSize == NumThreads;
  // precondition: NumWaves < 0xffffu;
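  // Worked example (assuming NumWaves == 3): the first two leader lanes read
  // low counts 0 and 1 and spin on the generation bits; the third reads
  // 2 == NumWaves - 1, zeroes the low half and increments the high half,
  // which changes the generation the spinners are watching and releases them.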

  // Increment the low 16 bits once, using the lowest active thread.
  if (mapping::isLeaderInWarp()) {
    uint32_t load = atomic::add(&namedBarrierTracker, 1,
                                atomic::relaxed); // commutative

    // Record the number of times the barrier has been passed
    uint32_t generation = load & 0xffff0000u;

    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
      // Reached NumWaves in low bits so this is the last wave.
      // Set low bits to zero and increment high bits
      load += 0x00010000u; // wrap is safe
      load &= 0xffff0000u; // because bits zeroed second

      // Reset the wave counter and release the waiting waves
      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
    } else {
      // more waves still to go, spin until generation counter changes
      do {
        __builtin_amdgcn_s_sleep(0);
        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
      } while ((load & 0xffff0000u) == generation);
    }
  }
  fence::team(atomic::release);
}

void fenceTeam(atomic::OrderingTy Ordering) {
  return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
}

void fenceKernel(atomic::OrderingTy Ordering) {
  return __scoped_atomic_thread_fence(Ordering, atomic::device);
}

void fenceSystem(atomic::OrderingTy Ordering) {
  return __scoped_atomic_thread_fence(Ordering, atomic::system);
}

void syncWarp(__kmpc_impl_lanemask_t) {
  // This is a no-op on current AMDGPU hardware but it is used by the optimizer
  // to enforce convergent behaviour between control flow graphs.
  __builtin_amdgcn_wave_barrier();
}

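// Note: s_barrier is primarily an execution barrier, so syncThreads brackets
// it with team-scope fences to provide the memory ordering the caller asked
// for (release before, acquire after, or seq_cst for both); a relaxed request
// skips the fences entirely.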
void syncThreads(atomic::OrderingTy Ordering) {
  if (Ordering != atomic::relaxed)
    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);

  __builtin_amdgcn_s_barrier();

  if (Ordering != atomic::relaxed)
    fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
}
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }

// TODO: Don't have wavefront lane locks. Possibly can't have them.
void unsetLock(omp_lock_t *) { __builtin_trap(); }
int testLock(omp_lock_t *) { __builtin_trap(); }
void initLock(omp_lock_t *) { __builtin_trap(); }
void destroyLock(omp_lock_t *) { __builtin_trap(); }
void setLock(omp_lock_t *) { __builtin_trap(); }

constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;

void unsetCriticalLock(omp_lock_t *Lock) {
  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
}

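// Note: only the lowest active lane acquires the lock. Lanes of a wavefront
// execute in lockstep, so having every lane spin on the same lock word is
// unnecessary (and a deadlock risk); the rest of the wavefront proceeds once
// the leader holds the lock, and the kernel-scope fences supply the memory
// ordering around the relaxed CAS loop.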
void setCriticalLock(omp_lock_t *Lock) {
  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
  if (mapping::getThreadIdInWarp() == LowestActiveThread) {
    fenceKernel(atomic::release);
    while (
        !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) {
      __builtin_amdgcn_s_sleep(32);
    }
    fenceKernel(atomic::acquire);
  }
}

#endif
///}

/// NVPTX Implementation
///
///{
#ifdef __NVPTX__

uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope) {
  return __nvvm_atom_inc_gen_ui(Address, Val);
}

void namedBarrierInit() {}

void namedBarrier() {
  uint32_t NumThreads = omp_get_num_threads();
  ASSERT(NumThreads % 32 == 0, nullptr);

  // The named barrier for active parallel threads of a team in an L1 parallel
  // region to synchronize with each other.
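  // Note: a bar.sync with an explicit count (what __nvvm_barrier_sync_cnt
  // emits) only synchronizes that many threads, and the count must be a
  // multiple of the warp size, which the assertion above guarantees.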
  constexpr int BarrierNo = 7;
  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
}

void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }

void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }

void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }

void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }

void syncThreads(atomic::OrderingTy Ordering) {
  constexpr int BarrierNo = 8;
  __nvvm_barrier_sync(BarrierNo);
}

void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }

constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;

// TODO: This seems to hide a bug in the declare variant handling. If it is
// called before it is defined here the overload won't happen. Investigate
// later!
void unsetLock(omp_lock_t *Lock) {
  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
}

int testLock(omp_lock_t *Lock) {
  return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst);
}

void initLock(omp_lock_t *Lock) { unsetLock(Lock); }

void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }

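// Note: on contention the loop below backs off by busy-waiting on the clock
// register for roughly OMP_SPIN * blockId cycles before retrying the CAS, so
// blocks retry at staggered times instead of hammering the lock in lockstep.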
void setLock(omp_lock_t *Lock) {
  // TODO: not sure spinning is a good idea here..
  while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
                     atomic::seq_cst) != UNSET) {
    int32_t start = __nvvm_read_ptx_sreg_clock();
    int32_t now;
    for (;;) {
      now = __nvvm_read_ptx_sreg_clock();
      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
      if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
        break;
      }
    }
  } // wait for 0 to be the read value
}

void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }

void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }

#endif
///}

} // namespace impl

void synchronize::init(bool IsSPMD) {
  if (!IsSPMD)
    impl::namedBarrierInit();
}

void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }

void synchronize::threads(atomic::OrderingTy Ordering) {
  impl::syncThreads(Ordering);
}

void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
  impl::syncThreadsAligned(Ordering);
}

void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }

void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }

void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }

uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
                     atomic::MemScopeTy MemScope) {
  return impl::atomicInc(Addr, V, Ordering, MemScope);
}

void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }

void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }

extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}

void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
  __kmpc_barrier(Loc, TId);
  return 0;
}

void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
  if (mapping::isSPMDMode())
    return __kmpc_barrier_simple_spmd(Loc, TId);

  // Generic parallel regions are run with a multiple of the warp size or
  // single threaded; in the latter case we need to stop here.
  if (omp_get_num_threads() == 1)
    return __kmpc_flush(Loc);

  impl::namedBarrier();
}

[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
}

[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc,
                                                       int32_t TId) {
  synchronize::threads(atomic::OrderingTy::seq_cst);
}

int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
  return omp_get_thread_num() == 0;
}

void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) {
  return omp_get_thread_num() == Filter;
}

void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
  return __kmpc_master(Loc, TId);
}

void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
  // The barrier is explicitly called.
}

void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }

uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }

void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }

void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
  impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
}

void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
  impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
}

void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }

void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }

void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }

void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }

int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }

void ompx_sync_block(int Ordering) {
  impl::syncThreadsAligned(atomic::OrderingTy(Ordering));
}
void ompx_sync_block_acq_rel() {
  impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
}
void ompx_sync_block_divergent(int Ordering) {
  impl::syncThreads(atomic::OrderingTy(Ordering));
}
} // extern "C"