//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace ompx;

namespace {

#pragma omp begin declare target device_type(nohost)

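// Reduce the values of a full warp into lane 0 using a power-of-two shuffle
// tree with offsets WarpSize/2, WarpSize/4, ..., 1. The element-wise combine
// itself is performed by the compiler-emitted shflFct callback.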
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId (not used)=*/0,
            /*Offset=*/mask, /*AlgoVersion=*/0);
  }
}

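// Reduce the first `size` lanes of a warp into lane 0 when `size` is not
// necessarily a power of two. Each round folds the upper half of the
// remaining values onto the lower half, shrinking the live count from n to
// ceil(n/2), so lane 0 holds the combined value after ceil(log2(size))
// rounds. `tid` is the lane's index within the contiguous reducing set.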
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
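// Pre-Volta fallback for warps whose active lanes are not contiguous, which
// only happens for threads of an L2 parallel region. The active lanes are
// renumbered into logical lane ids; in each round the even logical lanes pull
// in the value of the next active lane above them while the odd ones drop
// out, until logical lane 0 holds the result. Returns 1 on that lane.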
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId=*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reductions within a team. It handles
   * parallel regions at both the L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Each warp master copies its value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */
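  // Illustrative sketch only (not this runtime's code; `Val`, `combine`,
  // `shuffleDown`, and `Partial` are hypothetical stand-ins): for a single
  // scalar, the three phases above correspond to the familiar pattern
  //
  //   for (unsigned Off = WarpSize / 2; Off > 0; Off /= 2)   // 1. warp tree
  //     Val = combine(Val, shuffleDown(Val, Off));
  //   if (LaneId == 0) Partial[WarpId] = Val;                // 2. gather
  //   barrier();
  //   if (WarpId == 0 && LaneId < NumWarps)                  // 3. final warp
  //     Val = warpReduce(Partial[LaneId]);
  //
  // where shflFct and cpyFct are the compiler-emitted callbacks that perform
  // the shuffle/combine and the shared-memory gather for the real, possibly
  // multi-variable, reduce_data.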

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpIdInBlock();

  // Volta execution model:
  // In the Generic execution mode a parallel region has either 1 thread or a
  // multiple of 32 threads. In the SPMD execution mode any number of threads
  // is possible.
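  // For example, NumThreads == 80 in SPMD mode gives WarpsNeeded == 3: warps
  // 0 and 1 are full and take the regular reduce below, while warp 2 has only
  // 80 % 32 == 16 participating lanes and takes the irregular reduce.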
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // When there are more than mapping::getWarpSize() threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
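  // Continuing the NumThreads == 80 example: cpyFct copies the three warp
  // results into lanes 0-2 of warp 0 through shared memory, after which warp
  // 0 folds them into BlockThreadId == 0.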
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in an L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When there are more than mapping::getWarpSize() threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Note that the OMP thread id is different from BlockThreadId in the case
  // of an L2 parallel region.
  return BlockThreadId == 0;
#endif // __CUDA_ARCH__ >= 700
}

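// Round s down to a whole number of warps, but never below 1.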
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

} // namespace

extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                               uint64_t reduce_data_size,
                                               void *reduce_data,
                                               ShuffleReductFnTy shflFct,
                                               InterWarpCopyFnTy cpyFct) {
  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams beyond the current upper limit: at any time we
  // only allow as many teams to write as there are slots in the buffer.
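  // Concretely: the first num_of_records teams proceed immediately; a later
  // team spins until IterCnt has advanced far enough, which happens once all
  // teams of the previous chunk have deposited their values (see the
  // atomic::add at the end of this function).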
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, atomic::aquire);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);

    // Propagate the memory writes above to the world.
    fence::kernel(atomic::release);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // num_of_records chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
                                 atomic::MemScopeTy::device);
  }

  // Synchronize in SPMD mode, as in generic mode all but one thread is
  // running the state machine.
  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  //   local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  //   a, a, ..., a, b, b, ..., b, c, c, ..., c
  //   |__________|
  //   num_of_records
  //
  // local_reduce_data layout (struct):
  //   a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //   1. do reduction within each warp.
  //   2. do reduction across warps.
  //   3. write the final result to the main reduction variable
  //      by returning 1 in the thread holding the reduction result.
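  // For instance, with num_of_records == 1024 and two reduction variables
  // (an int a and a double b), GlobalBuffer holds 1024 a-slots followed by
  // 1024 b-slots. Team T deposits into slot T % 1024; the last team to arrive
  // loads one slot per participating thread, strides over the remaining slots
  // with glredFct, and then reduces within the team as described above.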

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    // Ensure we see the global memory writes by other teams.
    fence::kernel(atomic::aquire);

    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When there are more than mapping::getWarpSize() threads, a block
      // reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed with writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
  }

  return 0;
}

void *__kmpc_reduction_get_fixed_buffer() {
  return state::getKernelLaunchEnvironment().ReductionBuffer;
}
}

#pragma omp end declare target