Warning: This file is not a C or C++ file. It does not have highlighting.
1 | //===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// |
---|---|
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // |
10 | //===----------------------------------------------------------------------===// |
11 | |
12 | #ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H |
13 | #define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H |
14 | |
15 | #include "DeviceTypes.h" |
16 | #include "DeviceUtils.h" |
17 | |
18 | namespace ompx { |
19 | namespace atomic { |
20 | |
/// Memory orderings accepted by the atomic, synchronization, and fence helpers
/// declared in this header. Every enumerator takes the value of the matching
/// compiler-provided __ATOMIC_* macro, so the helpers below can hand an
/// OrderingTy straight to the __scoped_atomic_* builtins.
enum OrderingTy {
  relaxed = __ATOMIC_RELAXED,
  acquire = __ATOMIC_ACQUIRE,
  release = __ATOMIC_RELEASE,
  acq_rel = __ATOMIC_ACQ_REL,
  seq_cst = __ATOMIC_SEQ_CST,
};
28 | |
/// Memory scopes accepted by the atomic helpers in this namespace. Every
/// enumerator takes the value of the matching compiler-provided
/// __MEMORY_SCOPE_* macro; the helpers below default to MemScopeTy::device.
enum MemScopeTy {
  system = __MEMORY_SCOPE_SYSTEM,
  device = __MEMORY_SCOPE_DEVICE,
  workgroup = __MEMORY_SCOPE_WRKGRP,
  wavefront = __MEMORY_SCOPE_WVFRNT,
  single = __MEMORY_SCOPE_SINGLE,
};
36 | |
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
/// \p MemScope selects the memory scope of the operation and defaults to
/// MemScopeTy::device. Only the declaration is visible in this header.
/// NOTE(review): presumably returns the value held before the increment --
/// confirm against the definition.
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
             MemScopeTy MemScope = MemScopeTy::device);
40 | |
/// Atomically perform <op> on \p Val and \p *Address with \p Ordering
/// semantics, restricted to the memory scope \p MemScope. The result is stored
/// in \p *Address.
///{
44 | |
45 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
46 | bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc, |
47 | atomic::OrderingTy OrderingFail, |
48 | MemScopeTy MemScope = MemScopeTy::device) { |
49 | return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false, |
50 | OrderingSucc, OrderingFail, MemScope); |
51 | } |
52 | |
53 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
54 | V add(Ty *Address, V Val, atomic::OrderingTy Ordering, |
55 | MemScopeTy MemScope = MemScopeTy::device) { |
56 | return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope); |
57 | } |
58 | |
59 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
60 | V load(Ty *Address, atomic::OrderingTy Ordering, |
61 | MemScopeTy MemScope = MemScopeTy::device) { |
62 | #ifdef __NVPTX__ |
63 | return __scoped_atomic_fetch_add(Address, V(0), Ordering, MemScope); |
64 | #else |
65 | return __scoped_atomic_load_n(Address, Ordering, MemScope); |
66 | #endif |
67 | } |
68 | |
/// Atomically write \p Val to \p *Address with \p Ordering semantics in the
/// memory scope \p MemScope. Nothing is returned.
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
void store(Ty *Address, V Val, atomic::OrderingTy Ordering,
           MemScopeTy MemScope = MemScopeTy::device) {
  __scoped_atomic_store_n(Address, Val, Ordering, MemScope);
}
74 | |
75 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
76 | V mul(Ty *Address, V Val, atomic::OrderingTy Ordering, |
77 | MemScopeTy MemScope = MemScopeTy::device) { |
78 | Ty TypedCurrentVal, TypedResultVal, TypedNewVal; |
79 | bool Success; |
80 | do { |
81 | TypedCurrentVal = atomic::load(Address, Ordering); |
82 | TypedNewVal = TypedCurrentVal * Val; |
83 | Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering, |
84 | atomic::relaxed, MemScope); |
85 | } while (!Success); |
86 | return TypedResultVal; |
87 | } |
88 | |
89 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
90 | utils::enable_if_t<!utils::is_floating_point_v<V>, V> |
91 | max(Ty *Address, V Val, atomic::OrderingTy Ordering, |
92 | MemScopeTy MemScope = MemScopeTy::device) { |
93 | return __scoped_atomic_fetch_max(Address, Val, Ordering, MemScope); |
94 | } |
95 | |
96 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
97 | utils::enable_if_t<utils::is_same_v<V, float>, V> |
98 | max(Ty *Address, V Val, atomic::OrderingTy Ordering, |
99 | MemScopeTy MemScope = MemScopeTy::device) { |
100 | if (Val >= 0) |
101 | return utils::bitCast<float>(max( |
102 | (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); |
103 | return utils::bitCast<float>(min( |
104 | (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); |
105 | } |
106 | |
107 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
108 | utils::enable_if_t<utils::is_same_v<V, double>, V> |
109 | max(Ty *Address, V Val, atomic::OrderingTy Ordering, |
110 | MemScopeTy MemScope = MemScopeTy::device) { |
111 | if (Val >= 0) |
112 | return utils::bitCast<double>(max( |
113 | (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); |
114 | return utils::bitCast<double>(min( |
115 | (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); |
116 | } |
117 | |
118 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
119 | utils::enable_if_t<!utils::is_floating_point_v<V>, V> |
120 | min(Ty *Address, V Val, atomic::OrderingTy Ordering, |
121 | MemScopeTy MemScope = MemScopeTy::device) { |
122 | return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope); |
123 | } |
124 | |
125 | // TODO: Implement this with __atomic_fetch_max and remove the duplication. |
126 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
127 | utils::enable_if_t<utils::is_same_v<V, float>, V> |
128 | min(Ty *Address, V Val, atomic::OrderingTy Ordering, |
129 | MemScopeTy MemScope = MemScopeTy::device) { |
130 | if (Val >= 0) |
131 | return utils::bitCast<float>(min( |
132 | (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope)); |
133 | return utils::bitCast<float>(max( |
134 | (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope)); |
135 | } |
136 | |
137 | // TODO: Implement this with __atomic_fetch_max and remove the duplication. |
138 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
139 | utils::enable_if_t<utils::is_same_v<V, double>, V> |
140 | min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering, |
141 | MemScopeTy MemScope = MemScopeTy::device) { |
142 | if (Val >= 0) |
143 | return utils::bitCast<double>(min( |
144 | (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope)); |
145 | return utils::bitCast<double>(max( |
146 | (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope)); |
147 | } |
148 | |
149 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
150 | V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering, |
151 | MemScopeTy MemScope = MemScopeTy::device) { |
152 | return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope); |
153 | } |
154 | |
155 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
156 | V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering, |
157 | MemScopeTy MemScope = MemScopeTy::device) { |
158 | return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope); |
159 | } |
160 | |
161 | template <typename Ty, typename V = utils::remove_addrspace_t<Ty>> |
162 | V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering, |
163 | MemScopeTy MemScope = MemScopeTy::device) { |
164 | return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope); |
165 | } |
166 | |
167 | static inline uint32_t |
168 | atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, |
169 | MemScopeTy MemScope = MemScopeTy::device) { |
170 | uint32_t R; |
171 | __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope); |
172 | return R; |
173 | } |
174 | |
175 | ///} |
176 | |
177 | } // namespace atomic |
178 | |
// Thread synchronization entry points. Only declarations are visible in this
// header.
namespace synchronize {

/// Initialize the synchronization machinery. Must be called by all threads.
/// NOTE(review): \p IsSPMD presumably tells the runtime whether the kernel
/// runs in SPMD mode -- confirm against the definition.
void init(bool IsSPMD);

/// Synchronize all threads in a warp identified by \p Mask.
void warp(LaneMaskTy Mask);

/// Synchronize all threads in a block and perform a fence before and after the
/// barrier according to \p Ordering. Note that the fence might be part of the
/// barrier.
void threads(atomic::OrderingTy Ordering);

/// Synchronizing threads is allowed even if they all hit different instances of
/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
/// restrictive in that it requires all threads to hit the same instance. The
/// noinline is removed by the openmp-opt pass and helps to preserve the
/// information till then.
///{

/// Synchronize all threads in a block that are all reaching the same
/// instruction (hence all threads in the block are "aligned"). Also perform a
/// fence before and after the barrier according to \p Ordering. Note that the
/// fence might be part of the barrier if the target offers this.
[[gnu::noinline, omp::assume("ompx_aligned_barrier")]] void
threadsAligned(atomic::OrderingTy Ordering);

///}

} // namespace synchronize
209 | |
// Memory fences at increasing scope. Only declarations are visible in this
// header.
namespace fence {

/// Memory fence with \p Ordering semantics for the team.
void team(atomic::OrderingTy Ordering);

/// Memory fence with \p Ordering semantics for the contention group.
void kernel(atomic::OrderingTy Ordering);

/// Memory fence with \p Ordering semantics for the system.
void system(atomic::OrderingTy Ordering);

} // namespace fence
222 | |
223 | } // namespace ompx |
224 | |
225 | #endif |
226 |
Warning: This file is not a C or C++ file. It does not have highlighting.