1//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//
10//===----------------------------------------------------------------------===//
11
12#include "Mapping.h"
13#include "Interface.h"
14#include "State.h"
15#include "Types.h"
16#include "Utils.h"
17
18#pragma omp begin declare target device_type(nohost)
19
20#include "llvm/Frontend/OpenMP/OMPGridValues.h"
21
22using namespace ompx;
23
24namespace ompx {
25namespace impl {
26
27// Forward declarations defined to be defined for AMDGCN and NVPTX.
28const llvm::omp::GV &getGridValue();
29LaneMaskTy activemask();
30LaneMaskTy lanemaskLT();
31LaneMaskTy lanemaskGT();
32uint32_t getThreadIdInWarp();
33uint32_t getThreadIdInBlock(int32_t Dim);
34uint32_t getNumberOfThreadsInBlock(int32_t Dim);
35uint32_t getNumberOfThreadsInKernel();
36uint32_t getBlockIdInKernel(int32_t Dim);
37uint32_t getNumberOfBlocksInKernel(int32_t Dim);
38uint32_t getWarpIdInBlock();
39uint32_t getNumberOfWarpsInBlock();
40
41/// AMDGCN Implementation
42///
43///{
44#pragma omp begin declare variant match(device = {arch(amdgcn)})
45
46const llvm::omp::GV &getGridValue() {
47 return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
48}
49
50uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
51 switch (Dim) {
52 case 0:
53 return __builtin_amdgcn_workgroup_size_x();
54 case 1:
55 return __builtin_amdgcn_workgroup_size_y();
56 case 2:
57 return __builtin_amdgcn_workgroup_size_z();
58 };
59 UNREACHABLE("Dim outside range!");
60}
61
62LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
63
64LaneMaskTy lanemaskLT() {
65 uint32_t Lane = mapping::getThreadIdInWarp();
66 int64_t Ballot = mapping::activemask();
67 uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
68 return Mask & Ballot;
69}
70
71LaneMaskTy lanemaskGT() {
72 uint32_t Lane = mapping::getThreadIdInWarp();
73 if (Lane == (mapping::getWarpSize() - 1))
74 return 0;
75 int64_t Ballot = mapping::activemask();
76 uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
77 return Mask & Ballot;
78}
79
80uint32_t getThreadIdInWarp() {
81 return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
82}
83
84uint32_t getThreadIdInBlock(int32_t Dim) {
85 switch (Dim) {
86 case 0:
87 return __builtin_amdgcn_workitem_id_x();
88 case 1:
89 return __builtin_amdgcn_workitem_id_y();
90 case 2:
91 return __builtin_amdgcn_workitem_id_z();
92 };
93 UNREACHABLE("Dim outside range!");
94}
95
96uint32_t getNumberOfThreadsInKernel() {
97 return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
98 __builtin_amdgcn_grid_size_z();
99}
100
101uint32_t getBlockIdInKernel(int32_t Dim) {
102 switch (Dim) {
103 case 0:
104 return __builtin_amdgcn_workgroup_id_x();
105 case 1:
106 return __builtin_amdgcn_workgroup_id_y();
107 case 2:
108 return __builtin_amdgcn_workgroup_id_z();
109 };
110 UNREACHABLE("Dim outside range!");
111}
112
113uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
114 switch (Dim) {
115 case 0:
116 return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
117 case 1:
118 return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
119 case 2:
120 return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
121 };
122 UNREACHABLE("Dim outside range!");
123}
124
125uint32_t getWarpIdInBlock() {
126 return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
127}
128
129uint32_t getNumberOfWarpsInBlock() {
130 return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
131}
132
133#pragma omp end declare variant
134///}
135
136/// NVPTX Implementation
137///
138///{
139#pragma omp begin declare variant match( \
140 device = {arch(nvptx, nvptx64)}, \
141 implementation = {extension(match_any)})
142
143uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
144 switch (Dim) {
145 case 0:
146 return __nvvm_read_ptx_sreg_ntid_x();
147 case 1:
148 return __nvvm_read_ptx_sreg_ntid_y();
149 case 2:
150 return __nvvm_read_ptx_sreg_ntid_z();
151 };
152 UNREACHABLE("Dim outside range!");
153}
154
155const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
156
157LaneMaskTy activemask() { return __nvvm_activemask(); }
158
159LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
160
161LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
162
163uint32_t getThreadIdInBlock(int32_t Dim) {
164 switch (Dim) {
165 case 0:
166 return __nvvm_read_ptx_sreg_tid_x();
167 case 1:
168 return __nvvm_read_ptx_sreg_tid_y();
169 case 2:
170 return __nvvm_read_ptx_sreg_tid_z();
171 };
172 UNREACHABLE("Dim outside range!");
173}
174
175uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
176
177uint32_t getBlockIdInKernel(int32_t Dim) {
178 switch (Dim) {
179 case 0:
180 return __nvvm_read_ptx_sreg_ctaid_x();
181 case 1:
182 return __nvvm_read_ptx_sreg_ctaid_y();
183 case 2:
184 return __nvvm_read_ptx_sreg_ctaid_z();
185 };
186 UNREACHABLE("Dim outside range!");
187}
188
189uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
190 switch (Dim) {
191 case 0:
192 return __nvvm_read_ptx_sreg_nctaid_x();
193 case 1:
194 return __nvvm_read_ptx_sreg_nctaid_y();
195 case 2:
196 return __nvvm_read_ptx_sreg_nctaid_z();
197 };
198 UNREACHABLE("Dim outside range!");
199}
200
201uint32_t getNumberOfThreadsInKernel() {
202 return impl::getNumberOfThreadsInBlock(0) *
203 impl::getNumberOfBlocksInKernel(0) *
204 impl::getNumberOfThreadsInBlock(1) *
205 impl::getNumberOfBlocksInKernel(1) *
206 impl::getNumberOfThreadsInBlock(2) *
207 impl::getNumberOfBlocksInKernel(2);
208}
209
210uint32_t getWarpIdInBlock() {
211 return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
212}
213
214uint32_t getNumberOfWarpsInBlock() {
215 return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
216 mapping::getWarpSize();
217}
218
219#pragma omp end declare variant
220///}
221
222uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
223
224} // namespace impl
225} // namespace ompx
226
227/// We have to be deliberate about the distinction of `mapping::` and `impl::`
228/// below to avoid repeating assumptions or including irrelevant ones.
229///{
230
231static bool isInLastWarp() {
232 uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
233 ~(mapping::getWarpSize() - 1);
234 return mapping::getThreadIdInBlock() == MainTId;
235}
236
237bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
238 if (IsSPMD || icv::Level)
239 return false;
240
241 // Check if this is the last warp in the block.
242 return isInLastWarp();
243}
244
245bool mapping::isMainThreadInGenericMode() {
246 return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
247}
248
249bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
250 if (IsSPMD)
251 return mapping::getThreadIdInBlock() == 0;
252 return isInLastWarp();
253}
254
255bool mapping::isLeaderInWarp() {
256 __kmpc_impl_lanemask_t Active = mapping::activemask();
257 __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
258 return utils::popc(Active & LaneMaskLT) == 0;
259}
260
261LaneMaskTy mapping::activemask() { return impl::activemask(); }
262
263LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
264
265LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
266
267uint32_t mapping::getThreadIdInWarp() {
268 uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
269 ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
270 return ThreadIdInWarp;
271}
272
273uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
274 uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
275 return ThreadIdInBlock;
276}
277
278uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
279
280uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
281 uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
282 // If we are in SPMD mode, remove one warp.
283 return BlockSize - (!IsSPMD * impl::getWarpSize());
284}
285uint32_t mapping::getMaxTeamThreads() {
286 return mapping::getMaxTeamThreads(mapping::isSPMDMode());
287}
288
289uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
290 return impl::getNumberOfThreadsInBlock(Dim);
291}
292
293uint32_t mapping::getNumberOfThreadsInKernel() {
294 return impl::getNumberOfThreadsInKernel();
295}
296
297uint32_t mapping::getWarpIdInBlock() {
298 uint32_t WarpID = impl::getWarpIdInBlock();
299 ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
300 return WarpID;
301}
302
303uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
304 uint32_t BlockId = impl::getBlockIdInKernel(Dim);
305 ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
306 return BlockId;
307}
308
309uint32_t mapping::getNumberOfWarpsInBlock() {
310 uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
311 ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
312 return NumberOfWarpsInBlocks;
313}
314
315uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
316 uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
317 ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
318 return NumberOfBlocks;
319}
320
321uint32_t mapping::getNumberOfProcessorElements() {
322 return static_cast<uint32_t>(config::getHardwareParallelism());
323}
324
325///}
326
327/// Execution mode
328///
329///{
330
331// TODO: This is a workaround for initialization coming from kernels outside of
332// the TU. We will need to solve this more correctly in the future.
333[[gnu::weak]] int SHARED(IsSPMDMode);
334
335void mapping::init(bool IsSPMD) {
336 if (mapping::isInitialThreadInLevel0(IsSPMD))
337 IsSPMDMode = IsSPMD;
338}
339
340bool mapping::isSPMDMode() { return IsSPMDMode; }
341
342bool mapping::isGenericMode() { return !isSPMDMode(); }
343///}
344
345extern "C" {
346[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
347 return mapping::getThreadIdInBlock();
348}
349
350[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
351 return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
352}
353
354[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
355 return impl::getWarpSize();
356}
357}
358
359#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \
360 extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }
361
362_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
363_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
364_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
365_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
366
367#pragma omp end declare target
368

source code of offload/DeviceRTL/src/Mapping.cpp