//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "DeviceTypes.h"
#include "DeviceUtils.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"

using namespace ompx;

namespace {

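// Reduce data across a full warp with a butterfly pattern: every lane
// combines its value with the lane warpSize/2, warpSize/4, ..., 1 lanes
// away, so after log2(warpSize) shuffle steps lane 0 holds the result.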
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

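// Reduce data across a warp whose active lane count is not a power of two.
// Each step folds lanes [mask, curr_size) onto lanes [0, curr_size - mask)
// via shuffles, leaving ceil(curr_size / 2) partial values, so lane 0 holds
// the combined result after roughly log2(size) steps.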
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size = size;
  uint32_t mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

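// Reduce data across a set of dispersed (non-contiguous) active lanes, as
// can occur in an L2 parallel region. Lanes get logical ids from their
// position in the active mask, and in each round a lane pulls the value of
// the next live lane above it (found via ffs on the mask of higher lanes).
// Returns 1 in the lane that ends up holding the result.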
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}

static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;

  //
  // This reduce function handles reduction within a team. It handles
  // parallel regions in both L1 and L2 parallelism levels. It also
  // supports Generic, SPMD, and NoOMP modes.
  //
  // 1. Reduce within a warp.
  // 2. Warp master copies value to warp 0 via shared memory.
  // 3. Warp 0 reduces to a single value.
  // 4. The reduced value is available in the thread that returns 1.
  //
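  // For example, with 128 SPMD threads: each of the 4 warps reduces its
  // 32 values, cpyFct copies the 4 partial results into warp 0 via shared
  // memory, and warp 0 combines them so that the thread returning 1
  // (BlockThreadId 0) holds the final team-local value.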

#if __has_builtin(__nvvm_reflect)
  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    uint32_t WarpId = mapping::getWarpIdInBlock();

    // Volta execution model:
    // For the Generic execution mode a parallel region has either 1 thread
    // or a multiple of 32 threads. For the SPMD execution mode we may have
    // any number of threads.
    if ((NumThreads % mapping::getWarpSize() == 0) ||
        (WarpId < WarpsNeeded - 1))
      gpu_regular_warp_reduce(reduce_data, shflFct);
    else if (NumThreads > 1) // Only SPMD execution mode comes through here.
      gpu_irregular_warp_reduce(
          reduce_data, shflFct,
          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());

    // When we have more than [mapping::getWarpSize()] threads, a block
    // reduction is performed here.
    //
    // Only an L1 parallel region can enter this if condition.
    if (NumThreads > mapping::getWarpSize()) {
      // Gather all the reduced values from each warp
      // to the first warp.
      cpyFct(reduce_data, WarpsNeeded);

      if (WarpId == 0)
        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                  BlockThreadId);
    }
    return BlockThreadId == 0;
  }
#endif
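  // Pre-Volta (or non-NVPTX) path: dispatch on the shape of the active lane
  // mask. A mask of the form 2^k - 1 (no gaps above lane 0) satisfies
  // !(Liveness & (Liveness + 1)) and can use the irregular warp reduce;
  // a mask with gaps falls back to the SIMD reduction.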
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in an L2 parallel region may enter
         // here; return early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // In an L2 parallel region the OMP thread id differs from BlockThreadId,
  // but the thread with BlockThreadId == 0 still holds the reduced value.
  return BlockThreadId == 0;
}

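// Round s down to a multiple of the warp size, except that values smaller
// than one warp round to 1, e.g. with a warp size of 32: 70 -> 64, 20 -> 1.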
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

} // namespace

extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                               uint64_t reduce_data_size,
                                               void *reduce_data,
                                               ShuffleReductFnTy shflFct,
                                               InterWarpCopyFnTy cpyFct) {
  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}

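// Cross-team reduction over a fixed-size global buffer of num_of_records
// slots. Each team master copies (first pass) or reduces (later passes) its
// value into slot TeamId % num_of_records; once a chunk of num_of_records
// teams has written, the next chunk of teams is allowed to proceed. The
// very last team then reduces the buffer down to a single value and
// returns 1 in the thread that holds the result.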
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
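  // IterCnt bounds which teams may currently write into the buffer (it is
  // advanced one chunk at a time below); Cnt wraps at num_of_records and
  // yields each team's index within its chunk.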

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  [[clang::loader_uninitialized]] static Local<unsigned> Bound;
  [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;

  // Block progress for teams beyond the current upper limit. We only ever
  // allow a number of teams less than or equal to the number of slots in
  // the buffer.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, atomic::acquire);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);

    // Propagate the memory writes above to the world.
    fence::kernel(atomic::release);

    // Increment the team counter. This counter is incremented by all teams
    // in the current num_of_records chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
                                 atomic::MemScopeTy::device);
  }

  // Synchronize in SPMD mode; in generic mode all threads but one are in
  // the state machine.
  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  //   local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //  num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //   1. do reduction within each warp.
  //   2. do reduction across warps.
  //   3. write the final result to the main reduction variable
  //      by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    // Ensure we see the global memory writes by other teams.
    fence::kernel(atomic::acquire);

    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from the buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] threads, a block
      // reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
  }

  return 0;
}

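// Return the reduction scratch buffer preallocated in the kernel launch
// environment, used as the GlobalBuffer for the teams reduction above.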
void *__kmpc_reduction_get_fixed_buffer() {
  return state::getKernelLaunchEnvironment().ReductionBuffer;
}
} // end extern "C"