1//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//
10//===----------------------------------------------------------------------===//
11
12#include "Mapping.h"
13#include "DeviceTypes.h"
14#include "DeviceUtils.h"
15#include "Interface.h"
16#include "State.h"
17#include "gpuintrin.h"
18
19using namespace ompx;
20
// FIXME: This resolves the handling for the AMDGPU workgroup size when the ABI
// is set to 'none'. We only support COV5+ but this can be removed when COV4 is
// fully deprecated.
#ifdef __AMDGPU__
// Only emitted for AMDGPU builds: define the ABI version symbol with the fixed
// value 500 (presumably encoding code object version 5 -- confirm).
extern const inline uint32_t __oclc_ABI_version = 500;
// Make the same constant reachable under a second symbol name via an alias.
[[gnu::alias("__oclc_ABI_version")]] const uint32_t __oclc_ABI_version__;
#endif
28
29static bool isInLastWarp() {
30 uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
31 ~(mapping::getWarpSize() - 1);
32 return mapping::getThreadIdInBlock() == MainTId;
33}
34
35bool mapping::isMainThreadInGenericMode(bool IsSPMD) {
36 if (IsSPMD || icv::Level)
37 return false;
38
39 // Check if this is the last warp in the block.
40 return isInLastWarp();
41}
42
43bool mapping::isMainThreadInGenericMode() {
44 return mapping::isMainThreadInGenericMode(mapping::isSPMDMode());
45}
46
47bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
48 if (IsSPMD)
49 return mapping::getThreadIdInBlock() == 0;
50 return isInLastWarp();
51}
52
53bool mapping::isLeaderInWarp() {
54 __kmpc_impl_lanemask_t Active = mapping::activemask();
55 __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
56 return utils::popc(Active & LaneMaskLT) == 0;
57}
58
59LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
60
61LaneMaskTy mapping::lanemaskLT() {
62#ifdef __NVPTX__
63 return __nvvm_read_ptx_sreg_lanemask_lt();
64#else
65 uint32_t Lane = mapping::getThreadIdInWarp();
66 int64_t Ballot = mapping::activemask();
67 uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
68 return Mask & Ballot;
69#endif
70}
71
72LaneMaskTy mapping::lanemaskGT() {
73#ifdef __NVPTX__
74 return __nvvm_read_ptx_sreg_lanemask_gt();
75#else
76 uint32_t Lane = mapping::getThreadIdInWarp();
77 if (Lane == (mapping::getWarpSize() - 1))
78 return 0;
79 int64_t Ballot = mapping::activemask();
80 uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
81 return Mask & Ballot;
82#endif
83}
84
85uint32_t mapping::getThreadIdInWarp() {
86 uint32_t ThreadIdInWarp = __gpu_lane_id();
87 ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
88 return ThreadIdInWarp;
89}
90
91uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
92 uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
93 return ThreadIdInBlock;
94}
95
96uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
97
98uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
99 uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
100 // If we are in SPMD mode, remove one warp.
101 return BlockSize - (!IsSPMD * mapping::getWarpSize());
102}
103uint32_t mapping::getMaxTeamThreads() {
104 return mapping::getMaxTeamThreads(mapping::isSPMDMode());
105}
106
107uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
108 return __gpu_num_threads(Dim);
109}
110
111uint32_t mapping::getNumberOfThreadsInKernel() {
112 return mapping::getNumberOfThreadsInBlock(0) *
113 mapping::getNumberOfBlocksInKernel(0) *
114 mapping::getNumberOfThreadsInBlock(1) *
115 mapping::getNumberOfBlocksInKernel(1) *
116 mapping::getNumberOfThreadsInBlock(2) *
117 mapping::getNumberOfBlocksInKernel(2);
118}
119
120uint32_t mapping::getWarpIdInBlock() {
121 uint32_t WarpID =
122 mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
123 ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
124 return WarpID;
125}
126
127uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
128 uint32_t BlockId = __gpu_block_id(Dim);
129 ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
130 return BlockId;
131}
132
133uint32_t mapping::getNumberOfWarpsInBlock() {
134 return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
135 mapping::getWarpSize();
136}
137
138uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
139 return __gpu_num_blocks(Dim);
140}
141
142uint32_t mapping::getNumberOfProcessorElements() {
143 return static_cast<uint32_t>(config::getHardwareParallelism());
144}
145
146///}
147
148/// Execution mode
149///
150///{
151
// Flag recording whether the kernel runs in SPMD mode. It is written by
// mapping::init() and read back by mapping::isSPMDMode().
//
// TODO: This is a workaround for initialization coming from kernels outside of
// the TU. We will need to solve this more correctly in the future.
[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
155
156void mapping::init(bool IsSPMD) {
157 if (mapping::isInitialThreadInLevel0(IsSPMD))
158 IsSPMDMode = IsSPMD;
159}
160
161bool mapping::isSPMDMode() { return IsSPMDMode; }
162
163bool mapping::isGenericMode() { return !isSPMDMode(); }
164///}
165
166extern "C" {
167[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() {
168 return mapping::getThreadIdInBlock();
169}
170
171[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
172 return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
173}
174
175[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
176 return mapping::getWarpSize();
177}
178}
179
// Define a C-linkage function `ompx_<NAME>(int Dim)` that forwards \p Dim to
// `mapping::<MAPPER_NAME>` and returns the result as an `int`.
#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME)                                \
  extern "C" int ompx_##NAME(int Dim) { return mapping::MAPPER_NAME(Dim); }

// Instantiate the wrappers: ompx_thread_id, ompx_block_id, ompx_block_dim and
// ompx_grid_dim.
_TGT_KERNEL_LANGUAGE(thread_id, getThreadIdInBlock)
_TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
187
188extern "C" {
189uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
190 return utils::ballotSync(mask, pred);
191}
192
193int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
194 return utils::shuffleDown(mask, var, delta, width);
195}
196
197float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
198 int width) {
199 return utils::bitCast<float>(
200 utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width));
201}
202
203long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
204 return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
205}
206
207double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
208 int width) {
209 return utils::bitCast<double>(
210 utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
211}
212}
213

// Source: offload/DeviceRTL/src/Mapping.cpp