1//===-- lib/cuda/kernel.cpp -------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "flang/Runtime/CUDA/kernel.h"
10#include "flang-rt/runtime/descriptor.h"
11#include "flang-rt/runtime/terminator.h"
12#include "flang/Runtime/CUDA/common.h"
13
14#include "cuda_runtime.h"
15
16extern "C" {
17
18void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
19 intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
20 int64_t *stream, int32_t smem, void **params, void **extra) {
21 dim3 gridDim;
22 gridDim.x = gridX;
23 gridDim.y = gridY;
24 gridDim.z = gridZ;
25 dim3 blockDim;
26 blockDim.x = blockX;
27 blockDim.y = blockY;
28 blockDim.z = blockZ;
29 unsigned nbNegGridDim{0};
30 if (gridX < 0) {
31 ++nbNegGridDim;
32 }
33 if (gridY < 0) {
34 ++nbNegGridDim;
35 }
36 if (gridZ < 0) {
37 ++nbNegGridDim;
38 }
39 if (nbNegGridDim == 1) {
40 int maxBlocks, nbBlocks, dev, multiProcCount;
41 cudaError_t err1, err2;
42 nbBlocks = blockDim.x * blockDim.y * blockDim.z;
43 cudaGetDevice(&dev);
44 err1 = cudaDeviceGetAttribute(
45 &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
46 err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
47 &maxBlocks, kernel, nbBlocks, smem);
48 if (err1 == cudaSuccess && err2 == cudaSuccess) {
49 maxBlocks = multiProcCount * maxBlocks;
50 }
51 if (maxBlocks > 0) {
52 if (gridX > 0) {
53 maxBlocks = maxBlocks / gridDim.x;
54 }
55 if (gridY > 0) {
56 maxBlocks = maxBlocks / gridDim.y;
57 }
58 if (gridZ > 0) {
59 maxBlocks = maxBlocks / gridDim.z;
60 }
61 if (maxBlocks < 1) {
62 maxBlocks = 1;
63 }
64 if (gridX < 0) {
65 gridDim.x = maxBlocks;
66 }
67 if (gridY < 0) {
68 gridDim.y = maxBlocks;
69 }
70 if (gridZ < 0) {
71 gridDim.z = maxBlocks;
72 }
73 }
74 } else if (nbNegGridDim > 1) {
75 Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
76 terminator.Crash("Too many invalid grid dimensions");
77 }
78 cudaStream_t defaultStream = 0;
79 CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem,
80 stream != nullptr ? (cudaStream_t)(*stream) : defaultStream));
81}
82
83void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
84 intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
85 intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
86 int64_t *stream, int32_t smem, void **params, void **extra) {
87 cudaLaunchConfig_t config;
88 config.gridDim.x = gridX;
89 config.gridDim.y = gridY;
90 config.gridDim.z = gridZ;
91 config.blockDim.x = blockX;
92 config.blockDim.y = blockY;
93 config.blockDim.z = blockZ;
94 unsigned nbNegGridDim{0};
95 if (gridX < 0) {
96 ++nbNegGridDim;
97 }
98 if (gridY < 0) {
99 ++nbNegGridDim;
100 }
101 if (gridZ < 0) {
102 ++nbNegGridDim;
103 }
104 if (nbNegGridDim == 1) {
105 int maxBlocks, nbBlocks, dev, multiProcCount;
106 cudaError_t err1, err2;
107 nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
108 cudaGetDevice(&dev);
109 err1 = cudaDeviceGetAttribute(
110 &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
111 err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
112 &maxBlocks, kernel, nbBlocks, smem);
113 if (err1 == cudaSuccess && err2 == cudaSuccess) {
114 maxBlocks = multiProcCount * maxBlocks;
115 }
116 if (maxBlocks > 0) {
117 if (gridX > 0) {
118 maxBlocks = maxBlocks / config.gridDim.x;
119 }
120 if (gridY > 0) {
121 maxBlocks = maxBlocks / config.gridDim.y;
122 }
123 if (gridZ > 0) {
124 maxBlocks = maxBlocks / config.gridDim.z;
125 }
126 if (maxBlocks < 1) {
127 maxBlocks = 1;
128 }
129 if (gridX < 0) {
130 config.gridDim.x = maxBlocks;
131 }
132 if (gridY < 0) {
133 config.gridDim.y = maxBlocks;
134 }
135 if (gridZ < 0) {
136 config.gridDim.z = maxBlocks;
137 }
138 }
139 } else if (nbNegGridDim > 1) {
140 Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
141 terminator.Crash("Too many invalid grid dimensions");
142 }
143 config.dynamicSmemBytes = smem;
144 if (stream != nullptr) {
145 config.stream = (cudaStream_t)(*stream);
146 } else {
147 config.stream = 0;
148 }
149 cudaLaunchAttribute launchAttr[1];
150 launchAttr[0].id = cudaLaunchAttributeClusterDimension;
151 launchAttr[0].val.clusterDim.x = clusterX;
152 launchAttr[0].val.clusterDim.y = clusterY;
153 launchAttr[0].val.clusterDim.z = clusterZ;
154 config.numAttrs = 1;
155 config.attrs = launchAttr;
156 CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
157}
158
159void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
160 intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
161 intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
162 void **extra) {
163 dim3 gridDim;
164 gridDim.x = gridX;
165 gridDim.y = gridY;
166 gridDim.z = gridZ;
167 dim3 blockDim;
168 blockDim.x = blockX;
169 blockDim.y = blockY;
170 blockDim.z = blockZ;
171 unsigned nbNegGridDim{0};
172 if (gridX < 0) {
173 ++nbNegGridDim;
174 }
175 if (gridY < 0) {
176 ++nbNegGridDim;
177 }
178 if (gridZ < 0) {
179 ++nbNegGridDim;
180 }
181 if (nbNegGridDim == 1) {
182 int maxBlocks, nbBlocks, dev, multiProcCount;
183 cudaError_t err1, err2;
184 nbBlocks = blockDim.x * blockDim.y * blockDim.z;
185 cudaGetDevice(&dev);
186 err1 = cudaDeviceGetAttribute(
187 &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
188 err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
189 &maxBlocks, kernel, nbBlocks, smem);
190 if (err1 == cudaSuccess && err2 == cudaSuccess) {
191 maxBlocks = multiProcCount * maxBlocks;
192 }
193 if (maxBlocks > 0) {
194 if (gridX > 0) {
195 maxBlocks = maxBlocks / gridDim.x;
196 }
197 if (gridY > 0) {
198 maxBlocks = maxBlocks / gridDim.y;
199 }
200 if (gridZ > 0) {
201 maxBlocks = maxBlocks / gridDim.z;
202 }
203 if (maxBlocks < 1) {
204 maxBlocks = 1;
205 }
206 if (gridX < 0) {
207 gridDim.x = maxBlocks;
208 }
209 if (gridY < 0) {
210 gridDim.y = maxBlocks;
211 }
212 if (gridZ < 0) {
213 gridDim.z = maxBlocks;
214 }
215 }
216 } else if (nbNegGridDim > 1) {
217 Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
218 terminator.Crash("Too many invalid grid dimensions");
219 }
220 cudaStream_t defaultStream = 0;
221 CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim,
222 params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream));
223}
224
225} // extern "C"
226

source code of flang-rt/lib/cuda/kernel.cpp