//===- Synchronization.cpp - OpenMP Device synchronization API -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the device runtime synchronization API and atomics.
//
//===----------------------------------------------------------------------===//

#include "Synchronization.h"

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Types.h"
#include "Utils.h"

#pragma omp begin declare target device_type(nohost)

using namespace ompx;

namespace impl {

/// Atomics
///
///{
/// NOTE: This function needs to be implemented by every target.
uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope);

template <typename Ty>
Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_add(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicMul(Ty *Address, Ty V, atomic::OrderingTy Ordering) {
  Ty TypedCurrentVal, TypedNewVal;
  bool Success;
  do {
    TypedCurrentVal = atomic::load(Address, Ordering);
    TypedNewVal = TypedCurrentVal * V;
    Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
                          atomic::relaxed);
  } while (!Success);
  // Return the value the update was based on, matching the fetch-and-op
  // convention of the other atomics.
  return TypedCurrentVal;
}

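// An atomic load is emulated as an atomic fetch-add of zero, which returns the
// current value without modifying it.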
template <typename Ty> Ty atomicLoad(Ty *Address, atomic::OrderingTy Ordering) {
  return atomicAdd(Address, Ty(0), Ordering);
}

template <typename Ty>
void atomicStore(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  __scoped_atomic_store_n(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE);
}

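// Returns true iff the value at Address was ExpectedV and has been replaced by
// DesiredV. ExpectedV is taken by value, so the value observed on failure is
// not reported back to the caller.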
template <typename Ty>
bool atomicCAS(Ty *Address, Ty ExpectedV, Ty DesiredV,
               atomic::OrderingTy OrderingSucc,
               atomic::OrderingTy OrderingFail) {
  return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
                                          OrderingSucc, OrderingFail,
                                          __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicMin(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_min(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicMax(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_max(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

// TODO: Implement this with __atomic_fetch_max and remove the duplication.
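// Floating point min/max are implemented via integer atomics on the bit
// pattern: non-negative IEEE-754 values order like their signed integer
// patterns, while negative values order in reverse as unsigned patterns, so
// the minimum of a negative value is obtained with an unsigned maximum (and
// vice versa for atomicMaxFP).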
template <typename Ty, typename STy, typename UTy>
Ty atomicMinFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  if (Val >= 0)
    return atomicMin((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
  return atomicMax((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
}

template <typename Ty, typename STy, typename UTy>
Ty atomicMaxFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  if (Val >= 0)
    return atomicMax((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
  return atomicMin((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
}

template <typename Ty>
Ty atomicOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_or(Address, Val, Ordering,
                                  __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicAnd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_and(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicXOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_xor(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
                        atomic::OrderingTy Ordering) {
  uint32_t R;
  __scoped_atomic_exchange(Address, &Val, &R, Ordering, __MEMORY_SCOPE_DEVICE);
  return R;
}
///}

// Forward declarations of the functions to be defined for AMDGCN and NVPTX.
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope);
void namedBarrierInit();
void namedBarrier();
void fenceTeam(atomic::OrderingTy Ordering);
void fenceKernel(atomic::OrderingTy Ordering);
void fenceSystem(atomic::OrderingTy Ordering);
void syncWarp(__kmpc_impl_lanemask_t);
void syncThreads(atomic::OrderingTy Ordering);
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
void unsetLock(omp_lock_t *);
int testLock(omp_lock_t *);
void initLock(omp_lock_t *);
void destroyLock(omp_lock_t *);
void setLock(omp_lock_t *);
void unsetCriticalLock(omp_lock_t *);
void setCriticalLock(omp_lock_t *);

/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope) {
  // __builtin_amdgcn_atomic_inc32 should expand to this switch when passed a
  // runtime value, but does not do so yet. Workaround here.
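  // Map the generic memory scopes onto the AMDGPU sync scope strings:
  // all -> "" (system), device -> "agent", cgroup -> "workgroup".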

#define ScopeSwitch(ORDER) \
  switch (MemScope) { \
  case atomic::MemScopeTy::all: \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, ""); \
  case atomic::MemScopeTy::device: \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent"); \
  case atomic::MemScopeTy::cgroup: \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup"); \
  }

#define Case(ORDER) \
  case ORDER: \
    ScopeSwitch(ORDER)

  switch (Ordering) {
  default:
    __builtin_unreachable();
    Case(atomic::relaxed);
    Case(atomic::aquire);
    Case(atomic::release);
    Case(atomic::acq_rel);
    Case(atomic::seq_cst);
#undef Case
#undef ScopeSwitch
  }
}

uint32_t SHARED(namedBarrierTracker);

void namedBarrierInit() {
  // We don't have global ctors, and shared memory is not zero initialized.
  atomic::store(&namedBarrierTracker, 0u, atomic::release);
}

void namedBarrier() {
  uint32_t NumThreads = omp_get_num_threads();
  // assert(NumThreads % 32 == 0);

  uint32_t WarpSize = mapping::getWarpSize();
  uint32_t NumWaves = NumThreads / WarpSize;

  fence::team(atomic::aquire);

  // Named barrier implementation for AMDGCN.
  // Uses two 16-bit unsigned counters packed into a single atomically accessed
  // 32-bit integer: the low bits count the waves that have reached the barrier
  // (assumed zero before this call), the high bits count how many times the
  // barrier has been passed.
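  //
  //   namedBarrierTracker: [31:16] barrier pass count | [15:0] waves arrived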

  // precondition: NumWaves != 0;
  // invariant: NumWaves * WarpSize == NumThreads;
  // precondition: NumWaves < 0xffffu;

  // Increment the low 16 bits once, using the lowest active thread.
  if (mapping::isLeaderInWarp()) {
    uint32_t load = atomic::add(&namedBarrierTracker, 1,
                                atomic::relaxed); // commutative

    // Record the number of times the barrier has been passed.
    uint32_t generation = load & 0xffff0000u;

    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
      // Reached NumWaves in low bits so this is the last wave.
      // Set low bits to zero and increment high bits.
      load += 0x00010000u; // wrap is safe
      load &= 0xffff0000u; // because bits zeroed second

      // Reset the wave counter and release the waiting waves.
      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
    } else {
      // More waves still to go, spin until the generation counter changes.
      do {
        __builtin_amdgcn_s_sleep(0);
        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
      } while ((load & 0xffff0000u) == generation);
    }
  }
  fence::team(atomic::release);
}

// Sema checking of __builtin_amdgcn_fence is aggressive. The intention is to
// patch clang so that it is usable within a template environment and so that a
// runtime value of the memory order is expanded to this switch within
// clang/llvm.
void fenceTeam(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
  case atomic::aquire:
    return __builtin_amdgcn_fence(atomic::aquire, "workgroup");
  case atomic::release:
    return __builtin_amdgcn_fence(atomic::release, "workgroup");
  case atomic::acq_rel:
    return __builtin_amdgcn_fence(atomic::acq_rel, "workgroup");
  case atomic::seq_cst:
    return __builtin_amdgcn_fence(atomic::seq_cst, "workgroup");
  }
}
void fenceKernel(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
  case atomic::aquire:
    return __builtin_amdgcn_fence(atomic::aquire, "agent");
  case atomic::release:
    return __builtin_amdgcn_fence(atomic::release, "agent");
  case atomic::acq_rel:
    return __builtin_amdgcn_fence(atomic::acq_rel, "agent");
  case atomic::seq_cst:
    return __builtin_amdgcn_fence(atomic::seq_cst, "agent");
  }
}
void fenceSystem(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
  case atomic::aquire:
    return __builtin_amdgcn_fence(atomic::aquire, "");
  case atomic::release:
    return __builtin_amdgcn_fence(atomic::release, "");
  case atomic::acq_rel:
    return __builtin_amdgcn_fence(atomic::acq_rel, "");
  case atomic::seq_cst:
    return __builtin_amdgcn_fence(atomic::seq_cst, "");
  }
}

void syncWarp(__kmpc_impl_lanemask_t) {
  // This is a no-op on current AMDGPU hardware but it is used by the optimizer
  // to enforce convergent behaviour between control flow graphs.
  __builtin_amdgcn_wave_barrier();
}

void syncThreads(atomic::OrderingTy Ordering) {
  if (Ordering != atomic::relaxed)
    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);

  __builtin_amdgcn_s_barrier();

  if (Ordering != atomic::relaxed)
    fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
}
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }

// TODO: Don't have wavefront lane locks. Possibly can't have them.
void unsetLock(omp_lock_t *) { __builtin_trap(); }
int testLock(omp_lock_t *) { __builtin_trap(); }
void initLock(omp_lock_t *) { __builtin_trap(); }
void destroyLock(omp_lock_t *) { __builtin_trap(); }
void setLock(omp_lock_t *) { __builtin_trap(); }

constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;

void unsetCriticalLock(omp_lock_t *Lock) {
  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
}

void setCriticalLock(omp_lock_t *Lock) {
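  // Only the lowest active lane of the warp competes for the lock; letting
  // divergent lanes of the same warp contend for it could deadlock.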
  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
  if (mapping::getThreadIdInWarp() == LowestActiveThread) {
    fenceKernel(atomic::release);
    while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::relaxed,
                      atomic::relaxed)) {
      __builtin_amdgcn_s_sleep(32);
    }
    fenceKernel(atomic::aquire);
  }
}

#pragma omp end declare variant
///}

/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match( \
    device = {arch(nvptx, nvptx64)}, \
    implementation = {extension(match_any)})

uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope) {
  return __nvvm_atom_inc_gen_ui(Address, Val);
}

void namedBarrierInit() {}

void namedBarrier() {
  uint32_t NumThreads = omp_get_num_threads();
  ASSERT(NumThreads % 32 == 0, nullptr);

  // The named barrier is used by the active parallel threads of a team in an
  // L1 parallel region to synchronize with each other.
  constexpr int BarrierNo = 7;
  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
}

void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }

void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }

void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }

void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }

void syncThreads(atomic::OrderingTy Ordering) {
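  // Use a barrier resource distinct from the named barrier (7) used by
  // namedBarrier() above.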
  constexpr int BarrierNo = 8;
  __nvvm_barrier_sync(BarrierNo);
}

void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }

constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;

// TODO: This seems to hide a bug in the declare variant handling. If it is
//       called before it is defined here, the overload won't happen.
//       Investigate later!
void unsetLock(omp_lock_t *Lock) {
  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
}

int testLock(omp_lock_t *Lock) {
  return atomicAdd((uint32_t *)Lock, 0u, atomic::seq_cst);
}

void initLock(omp_lock_t *Lock) { unsetLock(Lock); }

void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }

void setLock(omp_lock_t *Lock) {
  // TODO: Not sure spinning is a good idea here...
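  // Back off between CAS attempts for a period proportional to the block id so
  // that blocks retry at staggered times.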
  while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
                   atomic::seq_cst) != UNSET) {
    int32_t start = __nvvm_read_ptx_sreg_clock();
    int32_t now;
    for (;;) {
      now = __nvvm_read_ptx_sreg_clock();
      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
      if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
        break;
      }
    }
  } // wait for 0 to be the read value
}

#pragma omp end declare variant
///}

} // namespace impl

void synchronize::init(bool IsSPMD) {
  if (!IsSPMD)
    impl::namedBarrierInit();
}

void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }

void synchronize::threads(atomic::OrderingTy Ordering) {
  impl::syncThreads(Ordering);
}

void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
  impl::syncThreadsAligned(Ordering);
}

void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }

void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }

void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }

#define ATOMIC_COMMON_OP(TY) \
  TY atomic::add(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicAdd(Addr, V, Ordering); \
  } \
  TY atomic::mul(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicMul(Addr, V, Ordering); \
  } \
  TY atomic::load(TY *Addr, atomic::OrderingTy Ordering) { \
    return impl::atomicLoad(Addr, Ordering); \
  } \
  bool atomic::cas(TY *Addr, TY ExpectedV, TY DesiredV, \
                   atomic::OrderingTy OrderingSucc, \
                   atomic::OrderingTy OrderingFail) { \
    return impl::atomicCAS(Addr, ExpectedV, DesiredV, OrderingSucc, \
                           OrderingFail); \
  }

#define ATOMIC_FP_ONLY_OP(TY, STY, UTY) \
  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicMinFP<TY, STY, UTY>(Addr, V, Ordering); \
  } \
  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicMaxFP<TY, STY, UTY>(Addr, V, Ordering); \
  } \
  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    impl::atomicStore(reinterpret_cast<UTY *>(Addr), \
                      utils::convertViaPun<UTY>(V), Ordering); \
  }

#define ATOMIC_INT_ONLY_OP(TY) \
  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicMin<TY>(Addr, V, Ordering); \
  } \
  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicMax<TY>(Addr, V, Ordering); \
  } \
  TY atomic::bit_or(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicOr(Addr, V, Ordering); \
  } \
  TY atomic::bit_and(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicAnd(Addr, V, Ordering); \
  } \
  TY atomic::bit_xor(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    return impl::atomicXOr(Addr, V, Ordering); \
  } \
  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) { \
    impl::atomicStore(Addr, V, Ordering); \
  }

#define ATOMIC_FP_OP(TY, STY, UTY) \
  ATOMIC_FP_ONLY_OP(TY, STY, UTY) \
  ATOMIC_COMMON_OP(TY)

#define ATOMIC_INT_OP(TY) \
  ATOMIC_INT_ONLY_OP(TY) \
  ATOMIC_COMMON_OP(TY)

// This needs to be kept in sync with the header. It is also the reason we
// don't use templates here.
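// For example, ATOMIC_INT_OP(uint32_t) instantiates the uint32_t overloads of
// atomic::min, max, bit_or, bit_and, bit_xor, and store, plus the common add,
// mul, load, and cas entry points.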
ATOMIC_INT_OP(int8_t)
ATOMIC_INT_OP(int16_t)
ATOMIC_INT_OP(int32_t)
ATOMIC_INT_OP(int64_t)
ATOMIC_INT_OP(uint8_t)
ATOMIC_INT_OP(uint16_t)
ATOMIC_INT_OP(uint32_t)
ATOMIC_INT_OP(uint64_t)
ATOMIC_FP_OP(float, int32_t, uint32_t)
ATOMIC_FP_OP(double, int64_t, uint64_t)

#undef ATOMIC_INT_ONLY_OP
#undef ATOMIC_FP_ONLY_OP
#undef ATOMIC_COMMON_OP
#undef ATOMIC_INT_OP
#undef ATOMIC_FP_OP

uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
                     atomic::MemScopeTy MemScope) {
  return impl::atomicInc(Addr, V, Ordering, MemScope);
}

void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }

void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }

extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}

void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
  __kmpc_barrier(Loc, TId);
  return 0;
}

void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
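  // The main thread in generic mode has no peer threads in its region to wait
  // for; the barrier reduces to its flush (memory ordering) semantics.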
  if (mapping::isMainThreadInGenericMode())
    return __kmpc_flush(Loc);

  if (mapping::isSPMDMode())
    return __kmpc_barrier_simple_spmd(Loc, TId);

  impl::namedBarrier();
}

[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
}

[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc,
                                                       int32_t TId) {
  synchronize::threads(atomic::OrderingTy::seq_cst);
}

int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
  return omp_get_thread_num() == 0;
}

void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) {
  return omp_get_thread_num() == Filter;
}

void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
  return __kmpc_master(Loc, TId);
}

void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
  // The barrier is explicitly called.
}

void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }

uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }

void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }

void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
  impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
}

void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
  impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
}

void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }

void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }

void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }

void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }

int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }

void ompx_sync_block(int Ordering) {
  impl::syncThreadsAligned(atomic::OrderingTy(Ordering));
}
void ompx_sync_block_acq_rel() {
  impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
}
void ompx_sync_block_divergent(int Ordering) {
  impl::syncThreads(atomic::OrderingTy(Ordering));
}
} // extern "C"

#pragma omp end declare target