//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reductions via the KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

namespace {

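// Reduce values across a full warp: each step combines a lane's value with
// the one `mask` lanes away via the compiler-generated shuffle-and-reduce
// function, halving the stride until lane 0 holds the final result.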
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId (unused)=*/0,
            /*Offset=*/mask, /*AlgoVersion=*/0);
  }
}

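// Reduce values across the first `size` contiguous lanes of a warp, where
// `size` need not be a power of two. Each step folds the upper half of the
// lanes onto the lower half; the result ends up in lane 0.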
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size = size;
  uint32_t mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

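// Reduce values across the active, possibly dispersed, lanes of a warp, as
// encountered in L2 parallel regions. Each round pairs a live lane with the
// next live lane above it. Returns 1 for the lane that ends up holding the
// reduced value.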
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId=*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}

static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;

  //
  // This reduce function handles reduction within a team. It handles
  // parallel regions in both L1 and L2 parallelism levels. It also
  // supports Generic, SPMD, and NoOMP modes.
  //
  // 1. Reduce within a warp.
  // 2. Warp master copies value to warp 0 via shared memory.
  // 3. Warp 0 reduces to a single value.
  // 4. The reduced value is available in the thread that returns 1.
  //

#if __has_builtin(__nvvm_reflect)
  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    uint32_t WarpId = mapping::getWarpIdInBlock();

    // Volta execution model:
    // In Generic execution mode a parallel region has either a single thread
    // or a multiple of 32 threads. In SPMD execution mode any number of
    // threads is possible.
    if ((NumThreads % mapping::getWarpSize() == 0) ||
        (WarpId < WarpsNeeded - 1))
      gpu_regular_warp_reduce(reduce_data, shflFct);
    else if (NumThreads > 1) // Only SPMD execution mode comes through here.
      gpu_irregular_warp_reduce(
          reduce_data, shflFct,
          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());

    // When there are more than mapping::getWarpSize() threads, a block
    // reduction is performed here.
    //
    // Only an L1 parallel region can enter this if condition.
    if (NumThreads > mapping::getWarpSize()) {
      // Gather all the reduced values from each warp
      // to the first warp.
      cpyFct(reduce_data, WarpsNeeded);

      if (WarpId == 0)
        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                  BlockThreadId);
    }
    return BlockThreadId == 0;
  }
#endif
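  // Fallback for targets without __nvvm_reflect or with __CUDA_ARCH below
  // 700: pick the reduction strategy from the shape of the active lane mask.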
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in an L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When there are more than mapping::getWarpSize() threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Note that the OMP thread id differs from BlockThreadId in an L2 parallel
  // region; block thread 0 holds the reduced value at this point.
  return BlockThreadId == 0;
}

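// Round s down to a multiple of the warp size, but return 1 for values
// smaller than a warp so at least one lane participates.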
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

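// Return the smaller of two unsigned values.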
uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

} // namespace

extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                               uint64_t reduce_data_size,
                                               void *reduce_data,
                                               ShuffleReductFnTy shflFct,
                                               InterWarpCopyFnTy cpyFct) {
  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

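  // IterCnt is the lower bound of the window of team ids currently allowed to
  // write into the buffer; Cnt counts the teams that have contributed their
  // result within the current num_of_records chunk.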
  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
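  // Block-shared state, written by the team master and read by all threads of
  // the team after the synchronization below.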
  [[clang::loader_uninitialized]] static Local<unsigned> Bound;
  [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;

  // Block progress for teams beyond the current upper limit. Only as many
  // teams as there are slots in the buffer (num_of_records) are allowed to
  // make progress at any one time.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, atomic::acquire);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records)
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);

    // Propagate the memory writes above to the world.
    fence::kernel(atomic::release);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // num_of_records chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
                                 atomic::MemScopeTy::device);
  }

  // Synchronize in SPMD mode; in generic mode all but one thread are in the
  // state machine.
  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //  num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  // 1. do reduction within each warp.
  // 2. do reduction across warps.
  // 3. write the final result to the main reduction variable
  //    by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team: ChunkTeamCount holds the value of
  // Cnt before this team's increment, so the last team of the final chunk
  // observes NumTeams - Bound - 1.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    // Ensure we see the global memory writes by other teams.
    fence::kernel(atomic::acquire);

    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When there are more than mapping::getWarpSize() threads, a block
      // reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed with writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
  }

  return 0;
}
}

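// Return the preallocated device buffer used to stage intermediate
// teams-reduction results.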
void *__kmpc_reduction_get_fixed_buffer() {
  return state::getKernelLaunchEnvironment().ReductionBuffer;
}