//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

namespace {

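// Reduce the values of a full warp: at every step the compiler-generated
// shuffle-and-reduce function combines each lane's value with the value held
// `mask` lanes away, halving the span until lane 0 holds the warp-wide
// result.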
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

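// Reduce the values of a warp whose number of active lanes (`size`) need not
// be a power of two: the active span is repeatedly halved (rounding up) until
// the first lane holds the combined result.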
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

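// Reduce the values of a warp with dispersed (non-contiguous) active lanes,
// as may occur in an L2 parallel region: live lanes are paired up and
// combined until a single lane remains; returns 1 in the lane that ends up
// holding the reduced value.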
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}

static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;

  //
  // This reduce function handles reduction within a team. It handles
  // parallel regions in both L1 and L2 parallelism levels. It also
  // supports Generic, SPMD, and NoOMP modes.
  //
  // 1. Reduce within a warp.
  // 2. Warp master copies value to warp 0 via shared memory.
  // 3. Warp 0 reduces to a single value.
  // 4. The reduced value is available in the thread that returns 1.
  //

#if __has_builtin(__nvvm_reflect)
  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    uint32_t WarpId = mapping::getWarpIdInBlock();

    // Volta execution model:
    // For the Generic execution mode a parallel region has either 1 thread
    // or a multiple of 32 threads. For the SPMD execution mode we may have
    // any number of threads.
    if ((NumThreads % mapping::getWarpSize() == 0) ||
        (WarpId < WarpsNeeded - 1))
      gpu_regular_warp_reduce(reduce_data, shflFct);
    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
      gpu_irregular_warp_reduce(
          reduce_data, shflFct,
          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());

    // When we have more than [mapping::getWarpSize()] threads, a block
    // reduction is performed here.
    //
    // Only the L1 parallel region can enter this if condition.
    if (NumThreads > mapping::getWarpSize()) {
      // Gather all the reduced values from each warp
      // to the first warp.
      cpyFct(reduce_data, WarpsNeeded);

      if (WarpId == 0)
        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                  BlockThreadId);
    }
    return BlockThreadId == 0;
  }
#endif
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] threads, a block
  // reduction is performed here.
  //
  // Only the L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // In all remaining cases the thread with BlockThreadId == 0 returns 1 and
  // thus reports the reduced value.
  return BlockThreadId == 0;
}

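// Round a thread count down to a multiple of the warp size; counts smaller
// than one warp are clamped to 1.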
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

} // namespace

extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                               uint64_t reduce_data_size,
                                               void *reduce_data,
                                               ShuffleReductFnTy shflFct,
                                               InterWarpCopyFnTy cpyFct) {
  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}
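
// Illustrative only (not emitted by this file): compiler-generated code for a
// parallel reduction is expected to call the entry point above roughly as
// follows, where `red_list`, `shuf_fn`, and `copy_fn` stand for the
// compiler-synthesized reduction list and helper functions:
//
//   if (__kmpc_nvptx_parallel_reduce_nowait_v2(loc, size, red_list,
//                                              shuf_fn, copy_fn) == 1) {
//     // This thread holds the reduced value; fold it into the original
//     // reduction variable(s).
//   }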

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
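  // Per-team bookkeeping, kept in team-local (shared) memory so that the
  // values written by the team master below are visible to the rest of the
  // team after the synchronization further down.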
  [[clang::loader_uninitialized]] static Local<unsigned> Bound;
  [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;

  // Block progress for teams greater than the current upper
  // limit. We only ever allow at most as many teams to proceed as there are
  // slots in the buffer.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, atomic::acquire);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);

    // Propagate the memory writes above to the world.
    fence::kernel(atomic::release);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // num_of_records chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
                                 atomic::MemScopeTy::device);
  }

  // Synchronize in SPMD mode; in generic mode all but one thread are in the
  // state machine, so no synchronization is needed there.
  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  // num_of_records
  //
  // local_data_reduce layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //   1. do reduction within each warp.
  //   2. do reduction across warps.
  //   3. write the final result to the main reduction variable
  //      by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    // Ensure we see the global memory writes by other teams.
    fence::kernel(atomic::acquire);

    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] threads, a block
      // reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed with writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
  }

  return 0;
}
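
// Return the runtime-provided fixed buffer used to stage per-team
// intermediate values for the teams reduction above.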
void *__kmpc_reduction_get_fixed_buffer() {
  return state::getKernelLaunchEnvironment().ReductionBuffer;
}
} // extern "C"