//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reductions via the KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

namespace {

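// Reduce values across a full warp: each step combines a lane's value with
// the one `mask` lanes away via the compiler-generated shuffle-and-reduce
// function, halving the stride until lane 0 holds the final result.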
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId (unused)=*/0,
            /*Offset=*/mask, /*AlgoVersion=*/0);
  }
}

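// Reduce values across the first `size` contiguous lanes of a warp, where
// `size` need not be a power of two. Each step folds the upper half of the
// lanes onto the lower half; the result ends up in lane 0.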
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size = size;
  uint32_t mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

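// Reduce values across the active, possibly dispersed, lanes of a warp, as
// encountered in L2 parallel regions. Each round pairs a live lane with the
// next live lane above it. Returns 1 for the lane that ends up holding the
// reduced value.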
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId=*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}

static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;

  //
  // This reduce function handles reduction within a team. It handles
  // parallel regions in both L1 and L2 parallelism levels. It also
  // supports Generic, SPMD, and NoOMP modes.
  //
  // 1. Reduce within a warp.
  // 2. Warp master copies value to warp 0 via shared memory.
  // 3. Warp 0 reduces to a single value.
  // 4. The reduced value is available in the thread that returns 1.
  //

#if __has_builtin(__nvvm_reflect)
  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    uint32_t WarpId = mapping::getWarpIdInBlock();

    // Volta execution model:
    // In Generic execution mode a parallel region has either a single thread
    // or a multiple of 32 threads. In SPMD execution mode any number of
    // threads is possible.
    if ((NumThreads % mapping::getWarpSize() == 0) ||
        (WarpId < WarpsNeeded - 1))
      gpu_regular_warp_reduce(reduce_data, shflFct);
    else if (NumThreads > 1) // Only SPMD execution mode comes through here.
      gpu_irregular_warp_reduce(
          reduce_data, shflFct,
          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());

    // When there are more than mapping::getWarpSize() threads, a block
    // reduction is performed here.
    //
    // Only an L1 parallel region can enter this if condition.
    if (NumThreads > mapping::getWarpSize()) {
      // Gather all the reduced values from each warp
      // to the first warp.
      cpyFct(reduce_data, WarpsNeeded);

      if (WarpId == 0)
        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                  BlockThreadId);
    }
    return BlockThreadId == 0;
  }
#endif
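  // Fallback for targets without __nvvm_reflect or with __CUDA_ARCH below
  // 700: pick the reduction strategy from the shape of the active lane mask.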
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in an L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When there are more than mapping::getWarpSize() threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Note that the OMP thread id differs from BlockThreadId in an L2 parallel
  // region; block thread 0 holds the reduced value at this point.
  return BlockThreadId == 0;
}

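// Round s down to a multiple of the warp size, but return 1 for values
// smaller than a warp so at least one lane participates.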
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

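// Return the smaller of two unsigned values.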
uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

} // namespace

extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                               uint64_t reduce_data_size,
                                               void *reduce_data,
                                               ShuffleReductFnTy shflFct,
                                               InterWarpCopyFnTy cpyFct) {
  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

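  // IterCnt is the lower bound of the window of team ids currently allowed to
  // write into the buffer; Cnt counts the teams that have contributed their
  // result within the current num_of_records chunk.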
  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
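  // Block-shared state, written by the team master and read by all threads of
  // the team after the synchronization below.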
  [[clang::loader_uninitialized]] static Local<unsigned> Bound;
  [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;

  // Block progress for teams beyond the current upper limit. Only as many
  // teams as there are slots in the buffer (num_of_records) are allowed to
  // make progress at any one time.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, atomic::acquire);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records)
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);

    // Propagate the memory writes above to the world.
    fence::kernel(atomic::release);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // num_of_records chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
                                 atomic::MemScopeTy::device);
  }

  // Synchronize in SPMD mode; in generic mode all but one thread are in the
  // state machine.
  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //  num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  // 1. do reduction within each warp.
  // 2. do reduction across warps.
  // 3. write the final result to the main reduction variable
  //    by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team: ChunkTeamCount holds the value of
  // Cnt before this team's increment, so the last team of the final chunk
  // observes NumTeams - Bound - 1.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    // Ensure we see the global memory writes by other teams.
    fence::kernel(atomic::acquire);

    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When there are more than mapping::getWarpSize() threads, a block
      // reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed with writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
  }

  return 0;
}
}

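// Return the preallocated device buffer used to stage intermediate
// teams-reduction results.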
void *__kmpc_reduction_get_fixed_buffer() {
  return state::getKernelLaunchEnvironment().ReductionBuffer;
}