//===- Synchronization.cpp - OpenMP Device synchronization API --- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the OpenMP device runtime synchronization API.
//
//===----------------------------------------------------------------------===//

#include "Synchronization.h"

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Types.h"
#include "Utils.h"

#pragma omp begin declare target device_type(nohost)

using namespace ompx;

namespace impl {

/// Atomics
///
///{
/// NOTE: This function needs to be implemented by every target.
uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope);

template <typename Ty>
Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_add(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

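// There is no hardware fetch-and-multiply, so atomicMul is emulated with a
// compare-and-swap retry loop: read the current value, compute the product,
// and try to install it until no other thread has raced in between.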
template <typename Ty>
Ty atomicMul(Ty *Address, Ty V, atomic::OrderingTy Ordering) {
  Ty TypedCurrentVal, TypedNewVal;
  bool Success;
  do {
    TypedCurrentVal = atomic::load(Address, Ordering);
    TypedNewVal = TypedCurrentVal * V;
    Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
                          atomic::relaxed);
  } while (!Success);
  // Return the value observed before the successful update, matching the
  // fetch-op convention of the other atomics.
  return TypedCurrentVal;
}

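// An atomic load with device scope is emulated as a fetch-add of zero, which
// returns the current value without modifying it.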
template <typename Ty> Ty atomicLoad(Ty *Address, atomic::OrderingTy Ordering) {
  return atomicAdd(Address, Ty(0), Ordering);
}

template <typename Ty>
void atomicStore(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  __scoped_atomic_store_n(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
bool atomicCAS(Ty *Address, Ty ExpectedV, Ty DesiredV,
               atomic::OrderingTy OrderingSucc,
               atomic::OrderingTy OrderingFail) {
  return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
                                          OrderingSucc, OrderingFail,
                                          __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicMin(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_min(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicMax(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_max(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

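// Floating-point min/max are implemented on top of the integer atomics by
// reinterpreting the IEEE-754 bit pattern: for non-negative values the pattern
// orders like the float under a signed compare, while for negative values the
// ordering is reversed under an unsigned compare. Hence a float min is a
// signed integer min when Val is non-negative and an unsigned integer max
// otherwise; atomicMaxFP mirrors the same trick.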
// TODO: Implement this with __atomic_fetch_max and remove the duplication.
template <typename Ty, typename STy, typename UTy>
Ty atomicMinFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  if (Val >= 0)
    return atomicMin((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
  return atomicMax((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
}

template <typename Ty, typename STy, typename UTy>
Ty atomicMaxFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  if (Val >= 0)
    return atomicMax((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
  return atomicMin((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
}

template <typename Ty>
Ty atomicOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_or(Address, Val, Ordering,
                                  __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicAnd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_and(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

template <typename Ty>
Ty atomicXOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
  return __scoped_atomic_fetch_xor(Address, Val, Ordering,
                                   __MEMORY_SCOPE_DEVICE);
}

uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
                        atomic::OrderingTy Ordering) {
  uint32_t R;
  __scoped_atomic_exchange(Address, &Val, &R, Ordering, __MEMORY_SCOPE_DEVICE);
  return R;
}
///}

// Forward declarations of functions to be defined for AMDGCN and NVPTX below.
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope);
void namedBarrierInit();
void namedBarrier();
void fenceTeam(atomic::OrderingTy Ordering);
void fenceKernel(atomic::OrderingTy Ordering);
void fenceSystem(atomic::OrderingTy Ordering);
void syncWarp(__kmpc_impl_lanemask_t);
void syncThreads(atomic::OrderingTy Ordering);
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
void unsetLock(omp_lock_t *);
int testLock(omp_lock_t *);
void initLock(omp_lock_t *);
void destroyLock(omp_lock_t *);
void setLock(omp_lock_t *);
void unsetCriticalLock(omp_lock_t *);
void setCriticalLock(omp_lock_t *);

/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope) {
  // builtin_amdgcn_atomic_inc32 should expand to this switch when
  // passed a runtime value, but does not do so yet. Workaround here.

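  // The nested macros below expand the runtime Ordering and MemScope values
  // into the compile-time ordering and scope arguments the builtin requires.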
#define ScopeSwitch(ORDER)                                                     \
  switch (MemScope) {                                                          \
  case atomic::MemScopeTy::all:                                                \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "");                     \
  case atomic::MemScopeTy::device:                                             \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent");                \
  case atomic::MemScopeTy::cgroup:                                             \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup");            \
  }

#define Case(ORDER)                                                            \
  case ORDER:                                                                  \
    ScopeSwitch(ORDER)

  switch (Ordering) {
  default:
    __builtin_unreachable();
  Case(atomic::relaxed);
  Case(atomic::aquire);
  Case(atomic::release);
  Case(atomic::acq_rel);
  Case(atomic::seq_cst);
#undef Case
#undef ScopeSwitch
  }
}


uint32_t SHARED(namedBarrierTracker);

void namedBarrierInit() {
  // We don't have global ctors, and shared memory is not zero-initialized.
  atomic::store(&namedBarrierTracker, 0u, atomic::release);
}

void namedBarrier() {
  uint32_t NumThreads = omp_get_num_threads();
  // assert(NumThreads % 32 == 0);

  uint32_t WarpSize = mapping::getWarpSize();
  uint32_t NumWaves = NumThreads / WarpSize;

  fence::team(atomic::aquire);

  // Named barrier implementation for AMDGCN.
  // Uses two 16 bit unsigned counters. One for the number of waves to have
  // reached the barrier, and one to count how many times the barrier has been
  // passed. These are packed in a single atomically accessed 32 bit integer.
  // Low bits for the number of waves, assumed zero before this call.
  // High bits to count the number of times the barrier has been passed.

  // precondition: NumWaves != 0;
  // invariant: NumWaves * WarpSize == NumThreads;
  // precondition: NumWaves < 0xffffu;

  // Increment the low 16 bits once, using the lowest active thread.
  if (mapping::isLeaderInWarp()) {
    uint32_t load = atomic::add(&namedBarrierTracker, 1,
                                atomic::relaxed); // commutative

    // Record the number of times the barrier has been passed.
    uint32_t generation = load & 0xffff0000u;

    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
      // Reached NumWaves in low bits so this is the last wave.
      // Set low bits to zero and increment high bits.
      load += 0x00010000u; // wrap is safe
      load &= 0xffff0000u; // because bits zeroed second

      // Reset the wave counter and release the waiting waves.
      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
    } else {
      // More waves still to go, spin until the generation counter changes.
      do {
        __builtin_amdgcn_s_sleep(0);
        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
      } while ((load & 0xffff0000u) == generation);
    }
  }
  fence::team(atomic::release);
}

// Sema checking of __builtin_amdgcn_fence is aggressive. The intention is to
// patch clang so that it is usable within a template environment and so that a
// runtime value of the memory order is expanded to this switch within
// clang/llvm.
void fenceTeam(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
  case atomic::aquire:
    return __builtin_amdgcn_fence(atomic::aquire, "workgroup");
  case atomic::release:
    return __builtin_amdgcn_fence(atomic::release, "workgroup");
  case atomic::acq_rel:
    return __builtin_amdgcn_fence(atomic::acq_rel, "workgroup");
  case atomic::seq_cst:
    return __builtin_amdgcn_fence(atomic::seq_cst, "workgroup");
  }
}
void fenceKernel(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
  case atomic::aquire:
    return __builtin_amdgcn_fence(atomic::aquire, "agent");
  case atomic::release:
    return __builtin_amdgcn_fence(atomic::release, "agent");
  case atomic::acq_rel:
    return __builtin_amdgcn_fence(atomic::acq_rel, "agent");
  case atomic::seq_cst:
    return __builtin_amdgcn_fence(atomic::seq_cst, "agent");
  }
}
void fenceSystem(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
  case atomic::aquire:
    return __builtin_amdgcn_fence(atomic::aquire, "");
  case atomic::release:
    return __builtin_amdgcn_fence(atomic::release, "");
  case atomic::acq_rel:
    return __builtin_amdgcn_fence(atomic::acq_rel, "");
  case atomic::seq_cst:
    return __builtin_amdgcn_fence(atomic::seq_cst, "");
  }
}

void syncWarp(__kmpc_impl_lanemask_t) {
  // This is a no-op on current AMDGPU hardware but it is used by the optimizer
  // to enforce convergent behaviour between control flow graphs.
  __builtin_amdgcn_wave_barrier();
}

void syncThreads(atomic::OrderingTy Ordering) {
  if (Ordering != atomic::relaxed)
    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);

  __builtin_amdgcn_s_barrier();

  if (Ordering != atomic::relaxed)
    fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
}
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }

// TODO: Don't have wavefront lane locks. Possibly can't have them.
void unsetLock(omp_lock_t *) { __builtin_trap(); }
int testLock(omp_lock_t *) { __builtin_trap(); }
void initLock(omp_lock_t *) { __builtin_trap(); }
void destroyLock(omp_lock_t *) { __builtin_trap(); }
void setLock(omp_lock_t *) { __builtin_trap(); }

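// Lock word values used by the critical lock below: UNSET marks the lock as
// free, SET marks it as held.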
constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;

void unsetCriticalLock(omp_lock_t *Lock) {
  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel);
}

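// Only the lowest active lane of the warp contends for the lock; the lanes of
// a warp execute in lockstep, so letting every lane spin on the same lock is
// redundant and can deadlock under branch divergence.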
void setCriticalLock(omp_lock_t *Lock) {
  uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1;
  if (mapping::getThreadIdInWarp() == LowestActiveThread) {
    fenceKernel(atomic::release);
    while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::relaxed,
                      atomic::relaxed)) {
      __builtin_amdgcn_s_sleep(32);
    }
    fenceKernel(atomic::aquire);
  }
}

#pragma omp end declare variant
///}

/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match(                                      \
        device = {arch(nvptx, nvptx64)},                                      \
        implementation = {extension(match_any)})

uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
                   atomic::MemScopeTy MemScope) {
  return __nvvm_atom_inc_gen_ui(Address, Val);
}

void namedBarrierInit() {}

void namedBarrier() {
  uint32_t NumThreads = omp_get_num_threads();
  ASSERT(NumThreads % 32 == 0, nullptr);

  // Named barrier used by the active parallel threads of a team in an L1
  // parallel region to synchronize with each other.
  constexpr int BarrierNo = 7;
  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
}

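// The NVPTX fences ignore the requested ordering and always emit a full memory
// barrier at the corresponding scope (CTA, device, or system).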
void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }

void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); }

void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }

void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }

void syncThreads(atomic::OrderingTy Ordering) {
  constexpr int BarrierNo = 8;
  __nvvm_barrier_sync(BarrierNo);
}

void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }

constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;

// TODO: This seems to hide a bug in the declare variant handling. If it is
// called before it is defined here, the overload won't happen. Investigate
// later!
void unsetLock(omp_lock_t *Lock) {
  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
}

int testLock(omp_lock_t *Lock) {
  return atomicAdd((uint32_t *)Lock, 0u, atomic::seq_cst);
}

void initLock(omp_lock_t *Lock) { unsetLock(Lock); }

void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }

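// Spin on a CAS from UNSET to SET. Between attempts, busy-wait for a number of
// clock cycles proportional to the block id so that different blocks retry at
// staggered times instead of hammering the lock simultaneously.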
void setLock(omp_lock_t *Lock) {
  // TODO: not sure spinning is a good idea here.
  while (!atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
                    atomic::seq_cst)) {
    int32_t start = __nvvm_read_ptx_sreg_clock();
    int32_t now;
    for (;;) {
      now = __nvvm_read_ptx_sreg_clock();
      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
      if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) {
        break;
      }
    }
  } // spin until the CAS succeeds and the lock is acquired
}

#pragma omp end declare variant
///}

} // namespace impl

void synchronize::init(bool IsSPMD) {
  if (!IsSPMD)
    impl::namedBarrierInit();
}

void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }

void synchronize::threads(atomic::OrderingTy Ordering) {
  impl::syncThreads(Ordering);
}

void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
  impl::syncThreadsAligned(Ordering);
}

void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }

void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }

void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }

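// The macros below stamp out the public atomic:: entry points for every
// supported integer and floating-point type; the type list must match the
// declarations in the header.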
#define ATOMIC_COMMON_OP(TY)                                                   \
  TY atomic::add(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicAdd(Addr, V, Ordering);                                 \
  }                                                                            \
  TY atomic::mul(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMul(Addr, V, Ordering);                                 \
  }                                                                            \
  TY atomic::load(TY *Addr, atomic::OrderingTy Ordering) {                     \
    return impl::atomicLoad(Addr, Ordering);                                   \
  }                                                                            \
  bool atomic::cas(TY *Addr, TY ExpectedV, TY DesiredV,                        \
                   atomic::OrderingTy OrderingSucc,                            \
                   atomic::OrderingTy OrderingFail) {                          \
    return impl::atomicCAS(Addr, ExpectedV, DesiredV, OrderingSucc,            \
                           OrderingFail);                                      \
  }

#define ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                        \
  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMinFP<TY, STY, UTY>(Addr, V, Ordering);                 \
  }                                                                            \
  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMaxFP<TY, STY, UTY>(Addr, V, Ordering);                 \
  }                                                                            \
  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    impl::atomicStore(reinterpret_cast<UTY *>(Addr),                           \
                      utils::convertViaPun<UTY>(V), Ordering);                 \
  }

#define ATOMIC_INT_ONLY_OP(TY)                                                 \
  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMin<TY>(Addr, V, Ordering);                             \
  }                                                                            \
  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMax<TY>(Addr, V, Ordering);                             \
  }                                                                            \
  TY atomic::bit_or(TY *Addr, TY V, atomic::OrderingTy Ordering) {             \
    return impl::atomicOr(Addr, V, Ordering);                                  \
  }                                                                            \
  TY atomic::bit_and(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    return impl::atomicAnd(Addr, V, Ordering);                                 \
  }                                                                            \
  TY atomic::bit_xor(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    return impl::atomicXOr(Addr, V, Ordering);                                 \
  }                                                                            \
  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    impl::atomicStore(Addr, V, Ordering);                                      \
  }

#define ATOMIC_FP_OP(TY, STY, UTY)                                             \
  ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                              \
  ATOMIC_COMMON_OP(TY)

#define ATOMIC_INT_OP(TY)                                                      \
  ATOMIC_INT_ONLY_OP(TY)                                                       \
  ATOMIC_COMMON_OP(TY)

// This needs to be kept in sync with the header. It is also the reason we
// don't use templates here.
ATOMIC_INT_OP(int8_t)
ATOMIC_INT_OP(int16_t)
ATOMIC_INT_OP(int32_t)
ATOMIC_INT_OP(int64_t)
ATOMIC_INT_OP(uint8_t)
ATOMIC_INT_OP(uint16_t)
ATOMIC_INT_OP(uint32_t)
ATOMIC_INT_OP(uint64_t)
ATOMIC_FP_OP(float, int32_t, uint32_t)
ATOMIC_FP_OP(double, int64_t, uint64_t)

#undef ATOMIC_INT_ONLY_OP
#undef ATOMIC_FP_ONLY_OP
#undef ATOMIC_COMMON_OP
#undef ATOMIC_INT_OP
#undef ATOMIC_FP_OP

uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
                     atomic::MemScopeTy MemScope) {
  return impl::atomicInc(Addr, V, Ordering, MemScope);
}

void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }

void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }

extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}

void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
  __kmpc_barrier(Loc, TId);
  return 0;
}

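// Barrier dispatch: the main thread in generic mode only performs a flush, in
// SPMD mode all threads take the aligned hardware barrier, and otherwise the
// active worker threads synchronize through the named barrier.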
void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
  if (mapping::isMainThreadInGenericMode())
    return __kmpc_flush(Loc);

  if (mapping::isSPMDMode())
    return __kmpc_barrier_simple_spmd(Loc, TId);

  impl::namedBarrier();
}

[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
}

[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc,
                                                       int32_t TId) {
  synchronize::threads(atomic::OrderingTy::seq_cst);
}

int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
  return omp_get_thread_num() == 0;
}

void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) {
  return omp_get_thread_num() == Filter;
}

void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {}

int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
  return __kmpc_master(Loc, TId);
}

void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
  // The barrier is explicitly called.
}

void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); }

uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }

void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }

void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
  impl::setCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
}

void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
  impl::unsetCriticalLock(reinterpret_cast<omp_lock_t *>(Name));
}

void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }

void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }

void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }

void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }

int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }

void ompx_sync_block(int Ordering) {
  impl::syncThreadsAligned(atomic::OrderingTy(Ordering));
}
void ompx_sync_block_acq_rel() {
  impl::syncThreadsAligned(atomic::OrderingTy::acq_rel);
}
void ompx_sync_block_divergent(int Ordering) {
  impl::syncThreads(atomic::OrderingTy(Ordering));
}
} // extern "C"

#pragma omp end declare target
