| 1 | //===-- lib/cuda/kernel.cpp -------------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "flang/Runtime/CUDA/kernel.h" |
| 10 | #include "flang-rt/runtime/descriptor.h" |
| 11 | #include "flang-rt/runtime/terminator.h" |
| 12 | #include "flang/Runtime/CUDA/common.h" |
| 13 | |
| 14 | #include "cuda_runtime.h" |
| 15 | |
| 16 | extern "C" { |
| 17 | |
| 18 | void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY, |
| 19 | intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, |
| 20 | int64_t *stream, int32_t smem, void **params, void **extra) { |
| 21 | dim3 gridDim; |
| 22 | gridDim.x = gridX; |
| 23 | gridDim.y = gridY; |
| 24 | gridDim.z = gridZ; |
| 25 | dim3 blockDim; |
| 26 | blockDim.x = blockX; |
| 27 | blockDim.y = blockY; |
| 28 | blockDim.z = blockZ; |
| 29 | unsigned nbNegGridDim{0}; |
| 30 | if (gridX < 0) { |
| 31 | ++nbNegGridDim; |
| 32 | } |
| 33 | if (gridY < 0) { |
| 34 | ++nbNegGridDim; |
| 35 | } |
| 36 | if (gridZ < 0) { |
| 37 | ++nbNegGridDim; |
| 38 | } |
| 39 | if (nbNegGridDim == 1) { |
| 40 | int maxBlocks, nbBlocks, dev, multiProcCount; |
| 41 | cudaError_t err1, err2; |
| 42 | nbBlocks = blockDim.x * blockDim.y * blockDim.z; |
| 43 | cudaGetDevice(&dev); |
| 44 | err1 = cudaDeviceGetAttribute( |
| 45 | &multiProcCount, cudaDevAttrMultiProcessorCount, dev); |
| 46 | err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( |
| 47 | &maxBlocks, kernel, nbBlocks, smem); |
| 48 | if (err1 == cudaSuccess && err2 == cudaSuccess) { |
| 49 | maxBlocks = multiProcCount * maxBlocks; |
| 50 | } |
| 51 | if (maxBlocks > 0) { |
| 52 | if (gridX > 0) { |
| 53 | maxBlocks = maxBlocks / gridDim.x; |
| 54 | } |
| 55 | if (gridY > 0) { |
| 56 | maxBlocks = maxBlocks / gridDim.y; |
| 57 | } |
| 58 | if (gridZ > 0) { |
| 59 | maxBlocks = maxBlocks / gridDim.z; |
| 60 | } |
| 61 | if (maxBlocks < 1) { |
| 62 | maxBlocks = 1; |
| 63 | } |
| 64 | if (gridX < 0) { |
| 65 | gridDim.x = maxBlocks; |
| 66 | } |
| 67 | if (gridY < 0) { |
| 68 | gridDim.y = maxBlocks; |
| 69 | } |
| 70 | if (gridZ < 0) { |
| 71 | gridDim.z = maxBlocks; |
| 72 | } |
| 73 | } |
| 74 | } else if (nbNegGridDim > 1) { |
| 75 | Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; |
| 76 | terminator.Crash("Too many invalid grid dimensions" ); |
| 77 | } |
| 78 | cudaStream_t defaultStream = 0; |
| 79 | CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, |
| 80 | stream != nullptr ? (cudaStream_t)(*stream) : defaultStream)); |
| 81 | } |
| 82 | |
| 83 | void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, |
| 84 | intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY, |
| 85 | intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, |
| 86 | int64_t *stream, int32_t smem, void **params, void **extra) { |
| 87 | cudaLaunchConfig_t config; |
| 88 | config.gridDim.x = gridX; |
| 89 | config.gridDim.y = gridY; |
| 90 | config.gridDim.z = gridZ; |
| 91 | config.blockDim.x = blockX; |
| 92 | config.blockDim.y = blockY; |
| 93 | config.blockDim.z = blockZ; |
| 94 | unsigned nbNegGridDim{0}; |
| 95 | if (gridX < 0) { |
| 96 | ++nbNegGridDim; |
| 97 | } |
| 98 | if (gridY < 0) { |
| 99 | ++nbNegGridDim; |
| 100 | } |
| 101 | if (gridZ < 0) { |
| 102 | ++nbNegGridDim; |
| 103 | } |
| 104 | if (nbNegGridDim == 1) { |
| 105 | int maxBlocks, nbBlocks, dev, multiProcCount; |
| 106 | cudaError_t err1, err2; |
| 107 | nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z; |
| 108 | cudaGetDevice(&dev); |
| 109 | err1 = cudaDeviceGetAttribute( |
| 110 | &multiProcCount, cudaDevAttrMultiProcessorCount, dev); |
| 111 | err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( |
| 112 | &maxBlocks, kernel, nbBlocks, smem); |
| 113 | if (err1 == cudaSuccess && err2 == cudaSuccess) { |
| 114 | maxBlocks = multiProcCount * maxBlocks; |
| 115 | } |
| 116 | if (maxBlocks > 0) { |
| 117 | if (gridX > 0) { |
| 118 | maxBlocks = maxBlocks / config.gridDim.x; |
| 119 | } |
| 120 | if (gridY > 0) { |
| 121 | maxBlocks = maxBlocks / config.gridDim.y; |
| 122 | } |
| 123 | if (gridZ > 0) { |
| 124 | maxBlocks = maxBlocks / config.gridDim.z; |
| 125 | } |
| 126 | if (maxBlocks < 1) { |
| 127 | maxBlocks = 1; |
| 128 | } |
| 129 | if (gridX < 0) { |
| 130 | config.gridDim.x = maxBlocks; |
| 131 | } |
| 132 | if (gridY < 0) { |
| 133 | config.gridDim.y = maxBlocks; |
| 134 | } |
| 135 | if (gridZ < 0) { |
| 136 | config.gridDim.z = maxBlocks; |
| 137 | } |
| 138 | } |
| 139 | } else if (nbNegGridDim > 1) { |
| 140 | Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; |
| 141 | terminator.Crash("Too many invalid grid dimensions" ); |
| 142 | } |
| 143 | config.dynamicSmemBytes = smem; |
| 144 | if (stream != nullptr) { |
| 145 | config.stream = (cudaStream_t)(*stream); |
| 146 | } else { |
| 147 | config.stream = 0; |
| 148 | } |
| 149 | cudaLaunchAttribute launchAttr[1]; |
| 150 | launchAttr[0].id = cudaLaunchAttributeClusterDimension; |
| 151 | launchAttr[0].val.clusterDim.x = clusterX; |
| 152 | launchAttr[0].val.clusterDim.y = clusterY; |
| 153 | launchAttr[0].val.clusterDim.z = clusterZ; |
| 154 | config.numAttrs = 1; |
| 155 | config.attrs = launchAttr; |
| 156 | CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params)); |
| 157 | } |
| 158 | |
| 159 | void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX, |
| 160 | intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, |
| 161 | intptr_t blockZ, int64_t *stream, int32_t smem, void **params, |
| 162 | void **extra) { |
| 163 | dim3 gridDim; |
| 164 | gridDim.x = gridX; |
| 165 | gridDim.y = gridY; |
| 166 | gridDim.z = gridZ; |
| 167 | dim3 blockDim; |
| 168 | blockDim.x = blockX; |
| 169 | blockDim.y = blockY; |
| 170 | blockDim.z = blockZ; |
| 171 | unsigned nbNegGridDim{0}; |
| 172 | if (gridX < 0) { |
| 173 | ++nbNegGridDim; |
| 174 | } |
| 175 | if (gridY < 0) { |
| 176 | ++nbNegGridDim; |
| 177 | } |
| 178 | if (gridZ < 0) { |
| 179 | ++nbNegGridDim; |
| 180 | } |
| 181 | if (nbNegGridDim == 1) { |
| 182 | int maxBlocks, nbBlocks, dev, multiProcCount; |
| 183 | cudaError_t err1, err2; |
| 184 | nbBlocks = blockDim.x * blockDim.y * blockDim.z; |
| 185 | cudaGetDevice(&dev); |
| 186 | err1 = cudaDeviceGetAttribute( |
| 187 | &multiProcCount, cudaDevAttrMultiProcessorCount, dev); |
| 188 | err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( |
| 189 | &maxBlocks, kernel, nbBlocks, smem); |
| 190 | if (err1 == cudaSuccess && err2 == cudaSuccess) { |
| 191 | maxBlocks = multiProcCount * maxBlocks; |
| 192 | } |
| 193 | if (maxBlocks > 0) { |
| 194 | if (gridX > 0) { |
| 195 | maxBlocks = maxBlocks / gridDim.x; |
| 196 | } |
| 197 | if (gridY > 0) { |
| 198 | maxBlocks = maxBlocks / gridDim.y; |
| 199 | } |
| 200 | if (gridZ > 0) { |
| 201 | maxBlocks = maxBlocks / gridDim.z; |
| 202 | } |
| 203 | if (maxBlocks < 1) { |
| 204 | maxBlocks = 1; |
| 205 | } |
| 206 | if (gridX < 0) { |
| 207 | gridDim.x = maxBlocks; |
| 208 | } |
| 209 | if (gridY < 0) { |
| 210 | gridDim.y = maxBlocks; |
| 211 | } |
| 212 | if (gridZ < 0) { |
| 213 | gridDim.z = maxBlocks; |
| 214 | } |
| 215 | } |
| 216 | } else if (nbNegGridDim > 1) { |
| 217 | Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; |
| 218 | terminator.Crash("Too many invalid grid dimensions" ); |
| 219 | } |
| 220 | cudaStream_t defaultStream = 0; |
| 221 | CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim, |
| 222 | params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream)); |
| 223 | } |
| 224 | |
| 225 | } // extern "C" |
| 226 | |