Mapping.cpp source code [offload/DeviceRTL/src/Mapping.cpp]

//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	//
10	//===----------------------------------------------------------------------===//
11
12	#include "Mapping.h"
13	#include "DeviceTypes.h"
14	#include "DeviceUtils.h"
15	#include "Interface.h"
16	#include "State.h"
17	#include "gpuintrin.h"
18
19	using namespace ompx;
20
// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI
// is set to 'none'. We only support COV5+ but this can be removed when COV4 is
// fully deprecated.
//
// The constant is only emitted for AMDGPU builds and is additionally exposed
// under the name `__oclc_ABI_version__` through a GNU alias.
#ifdef __AMDGPU__
extern const inline uint32_t __oclc_ABI_version = 500;
[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__;
#endif
28
29	static bool isInLastWarp() {
30	uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - `1`) &
31	~(mapping::getWarpSize() - `1`);
32	return mapping::getThreadIdInBlock() == MainTId;
33	}
34
35	bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
36	if (IsSPMD \|\| icv::Level)
37	return false;
38
39	// Check if this is the last warp in the block.
40	return isInLastWarp();
41	}
42
43	bool mapping::isMainThreadInGenericMode() {
44	return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
45	}
46
47	bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
48	if (IsSPMD)
49	return mapping::getThreadIdInBlock() == `0`;
50	return isInLastWarp();
51	}
52
53	bool mapping::isLeaderInWarp() {
54	__kmpc_impl_lanemask_t Active = mapping::activemask();
55	__kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
56	return utils::popc(Active & LaneMaskLT) == `0`;
57	}
58
59	LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
60
61	LaneMaskTy mapping::lanemaskLT() {
62	#ifdef __NVPTX__
63	return __nvvm_read_ptx_sreg_lanemask_lt();
64	#else
65	uint32_t Lane = mapping::getThreadIdInWarp();
66	int64_t Ballot = mapping::activemask();
67	uint64_t Mask = ((uint64_t)`1` << Lane) - (uint64_t)`1`;
68	return Mask & Ballot;
69	#endif
70	}
71
72	LaneMaskTy mapping::lanemaskGT() {
73	#ifdef __NVPTX__
74	return __nvvm_read_ptx_sreg_lanemask_gt();
75	#else
76	uint32_t Lane = mapping::getThreadIdInWarp();
77	if (Lane == (mapping::getWarpSize() - `1`))
78	return `0`;
79	int64_t Ballot = mapping::activemask();
80	uint64_t Mask = (~((uint64_t)`0`)) << (Lane + `1`);
81	return Mask & Ballot;
82	#endif
83	}
84
85	uint32_t mapping::getThreadIdInWarp() {
86	uint32_t ThreadIdInWarp = __gpu_lane_id();
87	ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
88	return ThreadIdInWarp;
89	}
90
91	uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
92	uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
93	return ThreadIdInBlock;
94	}
95
96	uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
97
98	uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
99	uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
100	// If we are in SPMD mode, remove one warp.
101	return BlockSize - (!IsSPMD * mapping::getWarpSize());
102	}
103	uint32_t mapping::getMaxTeamThreads() {
104	return mapping::getMaxTeamThreads(mapping::isSPMDMode());
105	}
106
107	uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
108	return __gpu_num_threads(Dim);
109	}
110
111	uint32_t mapping::getNumberOfThreadsInKernel() {
112	return mapping::getNumberOfThreadsInBlock(`0`) *
113	mapping::getNumberOfBlocksInKernel(`0`) *
114	mapping::getNumberOfThreadsInBlock(`1`) *
115	mapping::getNumberOfBlocksInKernel(`1`) *
116	mapping::getNumberOfThreadsInBlock(`2`) *
117	mapping::getNumberOfBlocksInKernel(`2`);
118	}
119
120	uint32_t mapping::getWarpIdInBlock() {
121	uint32_t WarpID =
122	mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
123	ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
124	return WarpID;
125	}
126
127	uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
128	uint32_t BlockId = __gpu_block_id(Dim);
129	ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
130	return BlockId;
131	}
132
133	uint32_t mapping::getNumberOfWarpsInBlock() {
134	return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - `1`) /
135	mapping::getWarpSize();
136	}
137
138	uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
139	return __gpu_num_blocks(Dim);
140	}
141
142	uint32_t mapping::getNumberOfProcessorElements() {
143	return static_cast<uint32_t>(config::getHardwareParallelism());
144	}
145
146	///}
147
148	/// Execution mode
149	///
150	///{
151
152	// TODO: This is a workaround for initialization coming from kernels outside of
153	// the TU. We will need to solve this more correctly in the future.
154	[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
155
156	void mapping::init(bool IsSPMD) {
157	if (mapping::isInitialThreadInLevel0(IsSPMD))
158	IsSPMDMode = IsSPMD;
159	}
160
161	bool mapping::isSPMDMode() { return IsSPMDMode; }
162
163	bool mapping::isGenericMode() { return !isSPMDMode(); }
164	///}
165
166	extern "C" {
167	[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
168	return mapping::getThreadIdInBlock();
169	}
170
171	[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
172	return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
173	}
174
175	[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
176	return mapping::getWarpSize();
177	}
178	}
179
180	#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \
181	extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }
182
183	_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
184	_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
185	_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
186	_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
187
188	extern "C" {
189	uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
190	return utils::ballotSync(mask, pred);
191	}
192
193	int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
194	return utils::shuffleDown(mask, var, delta, width);
195	}
196
197	float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
198	int width) {
199	return utils::bitCast<float>(
200	utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width));
201	}
202
203	long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
204	return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
205	}
206
207	double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
208	int width) {
209	return utils::bitCast<double>(
210	utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
211	}
212	}
213

source code of offload/DeviceRTL/src/Mapping.cpp